首页 > > 网络编程 > 其它 >

唯一图库爬取图片

2019-03-01 10:14:26来源：博客园阅读 ()

项目一：唯一图库

项目概述：根据需要到唯一图库爬取图片

所用技术：Scrapy、urllib、字符串处理、百分号格式化

爬虫程序根据 settings.py 和 items.py 中的配置，把爬取到的数据交给 pipeline 处理

相关操作记录如下：

爬虫程序

 1 #!/usr/bin/env python
 2 #-*- coding:utf-8 -*-
 3 #s1.py
 4 import scrapy
 5 from scrapy.selector import HtmlXPathSelector
 6 from spider1 import items
 7 
 8 class LL(scrapy.spiders.Spider):
 9     name = 'xx'
10     start_urls=['http://www.mmonly.cc/sgtp/',]
11     def parse(self,response):
12         hxs = HtmlXPathSelector(response)
13 
14         item = items.Spider1Item()
15         item['names'] = hxs.select('//div[@class="item_t"]//img/@alt').extract()
16         item['imgs'] = hxs.select('//div[@class="item_t"]//img/@src').extract()
17         yield item

View Code

settings.py

 1 # -*- coding: utf-8 -*-
 2 #settings.py
 3 BOT_NAME = 'spider1'
 4 #
 5 SPIDER_MODULES = ['spider1.spiders']
 6 NEWSPIDER_MODULE = 'spider1.spiders'
 7 ROBOTSTXT_OBEY = True
 8 ITEM_PIPELINES = {
 9    'spider1.pipelines.Spider1Pipeline': 100,
10 }

View Code

items.py

 1 # -*- coding: utf-8 -*-
 2 #items.py
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # https://doc.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
11 class Spider1Item(scrapy.Item):
12     # define the fields for your item here like:
13     # name = scrapy.Field()
14     names = scrapy.Field()
15     imgs = scrapy.Field()

View Code

pipelines.py

 1 # -*- coding: utf-8 -*-
 2 #pipelines.py
 3 # Define your item pipelines here
 4 #
 5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 
 8 
 9 class Spider1Pipeline(object):
10     def process_item(self, item, spider):
11 
12         names = item['names']
13         imgs = item['imgs']
14 
15         print names[0],imgs[0]
16         for i in range(len(names)):
17             if names[i] and imgs[i]:
18                 img_name = names[i] + '.jpg'
19                 c = str(imgs[i])
20                 if c.startswith('h'):
21                     net_url = c
22                 else:
23                     net_url = 'http://www.xiaohuar.com' + c
24                 local_file = 'C:\\Users\\wenxianfeng\\Desktop\\img\\%s' % img_name
25                 import urllib
26                 urllib.urlretrieve(net_url, local_file)
27 
28         return item