scrapy-redis: A Distributed Crawler for Bilibili User Data
2019-05-22 06:32:30 · Source: 博客园 (cnblogs)
In Scrapy, every requested URL gets a fingerprint, which is what decides whether that URL has already been requested. Fingerprinting is enabled by default, so each URL is fetched only once. When we crawl with several machines in a distributed setup, we still need fingerprints to keep the data free of duplicates, but by default Scrapy keeps its fingerprints locally on each machine. So instead we can store the fingerprints in Redis and use a Redis set to decide whether a request is a duplicate.
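The mechanism is easy to sketch with redis-py. The snippet below is only an illustration, not scrapy-redis's actual implementation: the key name 'bilibiliapp:dupefilter' follows the library's '<spider>:dupefilter' convention, and fingerprint() is a simplified stand-in for scrapy.utils.request.request_fingerprint.

import hashlib
import redis

def fingerprint(method, url):
    # Simplified stand-in for Scrapy's request fingerprint: hash the
    # request method and URL into a fixed-length hex digest
    return hashlib.sha1('{} {}'.format(method, url).encode('utf-8')).hexdigest()

r = redis.StrictRedis(host='127.0.0.1', port=6379)

def seen_before(method, url):
    # SADD returns 1 if the fingerprint is new, 0 if it was already in the
    # set -- i.e. some crawler instance has requested this URL before
    return r.sadd('bilibiliapp:dupefilter', fingerprint(method, url)) == 0

Because the set lives in Redis rather than on disk, every crawler instance pointed at the same Redis server shares one view of which URLs have been requested.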
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for bilibili project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bilibili'

SPIDER_MODULES = ['bilibili.spiders']
NEWSPIDER_MODULE = 'bilibili.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bilibili (+http://www.yourdomain.com)'

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'bilibili.middlewares.BilibiliSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'bilibili.middlewares.BilibiliDownloaderMiddleware': 543,
    'bilibili.middlewares.randomUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bilibili.pipelines.BilibiliPipeline': 300,
    # distinct priority so the Redis pipeline runs after the project pipeline
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# scrapy-redis: share the scheduler queue and the request-fingerprint set
# across all crawler instances through Redis
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
REDIS_URL = 'redis://@127.0.0.1:6379'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
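With these settings active, you can peek at what scrapy-redis stores while the crawl runs. A minimal sketch with redis-py, assuming the library's default key names ('<spider>:requests' for the scheduler queue, '<spider>:dupefilter' for the fingerprint set, '<spider>:items' for RedisPipeline output):

import redis

r = redis.StrictRedis.from_url('redis://@127.0.0.1:6379')
print(r.zcard('bilibiliapp:requests'))    # pending requests (PriorityQueue is a Redis sorted set)
print(r.scard('bilibiliapp:dupefilter'))  # unique request fingerprints seen so far
print(r.llen('bilibiliapp:items'))        # items pushed by scrapy_redis.pipelines.RedisPipeline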
spider.py
# -*- coding: utf-8 -*-
import scrapy
import json, re
from bilibili.items import BilibiliItem


class BilibiliappSpider(scrapy.Spider):
    name = 'bilibiliapp'
    # allowed_domains = ['www.bilibili.com']
    # start_urls = ['http://www.bilibili.com/']

    def start_requests(self):
        for i in range(1, 300):
            url = 'https://api.bilibili.com/x/relation/stat?vmid={}&jsonp=jsonp&callback=__jp3'.format(i)
            url_ajax = 'https://space.bilibili.com/{}/'.format(i)
            # For a GET request, use scrapy.Request(url=..., callback=...);
            # the API checks the referer, so point it at the user's space page
            req = scrapy.Request(url=url, callback=self.parse, meta={'id': i})
            req.headers['referer'] = url_ajax
            yield req

    def parse(self, response):
        # The response is JSONP-wrapped; strip the callback wrapper first
        comm = re.compile(r'({.*})')
        text = re.findall(comm, response.text)[0]
        data = json.loads(text)
        follower = data['data']['follower']
        following = data['data']['following']
        id = response.meta.get('id')
        url = 'https://space.bilibili.com/ajax/member/getSubmitVideos?mid={}&page=1&pagesize=25'.format(id)
        yield scrapy.Request(url=url, callback=self.getsubmit, meta={
            'id': id,
            'follower': follower,
            'following': following
        })

    def getsubmit(self, response):
        data = json.loads(response.text)
        # tlist maps category ids to the categories the user has uploaded in
        tlist = data['data']['tlist']
        tlist_list = []
        if tlist:
            for tils in tlist.values():
                tlist_list.append(tils['name'])
        else:
            # '无爱好' ("no interests") matches no category, so all flags stay 0
            tlist_list = ['无爱好']
        follower = response.meta.get('follower')
        following = response.meta.get('following')
        id = response.meta.get('id')
        url = 'https://api.bilibili.com/x/space/acc/info?mid={}&jsonp=jsonp'.format(id)
        yield scrapy.Request(url=url, callback=self.space, meta={
            'id': id,
            'follower': follower,
            'following': following,
            'tlist_list': tlist_list
        })

    def space(self, response):
        data = json.loads(response.text)
        name = data['data']['name']
        sex = data['data']['sex']
        level = data['data']['level']
        birthday = data['data']['birthday']
        tlist_list = response.meta.get('tlist_list')
        # One-hot flags for the upload categories the user is active in
        animation = 0
        Life = 0
        Music = 0
        Game = 0
        Dance = 0
        Documentary = 0
        Ghost = 0
        science = 0
        Opera = 0
        entertainment = 0
        Movies = 0
        National = 0
        Digital = 0
        fashion = 0
        for tlist in tlist_list:
            if tlist == '动画':
                animation = 1
            elif tlist == '生活':
                Life = 1
            elif tlist == '音乐':
                Music = 1
            elif tlist == '游戏':
                Game = 1
            elif tlist == '舞蹈':
                Dance = 1
            elif tlist == '纪录片':
                Documentary = 1
            elif tlist == '鬼畜':
                Ghost = 1
            elif tlist == '科技':
                science = 1
            elif tlist == '番剧':
                Opera = 1
            elif tlist == '娱乐':
                entertainment = 1
            elif tlist == '影视':
                Movies = 1
            elif tlist == '国创':
                National = 1
            elif tlist == '数码':
                Digital = 1
            elif tlist == '时尚':
                fashion = 1
        item = BilibiliItem()
        item['name'] = name
        item['sex'] = sex
        item['level'] = level
        item['birthday'] = birthday
        item['follower'] = response.meta.get('follower')
        item['following'] = response.meta.get('following')
        item['animation'] = animation
        item['Life'] = Life
        item['Music'] = Music
        item['Game'] = Game
        item['Dance'] = Dance
        item['Documentary'] = Documentary
        item['Ghost'] = Ghost
        item['science'] = science
        item['Opera'] = Opera
        item['entertainment'] = entertainment
        item['Movies'] = Movies
        item['National'] = National
        item['Digital'] = Digital
        item['fashion'] = fashion
        yield item
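One detail worth calling out in parse(): the relation/stat request carries callback=__jp3, so the body comes back JSONP-wrapped, and the regex ({.*}) strips the wrapper before json.loads. A standalone illustration with a made-up response body:

import json, re

# Hypothetical JSONP body, shaped like the relation/stat response
body = '__jp3({"code":0,"data":{"mid":1,"following":5,"follower":42}})'
payload = json.loads(re.findall(r'({.*})', body)[0])
print(payload['data']['follower'])  # -> 42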
Setting up a User-Agent pool (middlewares.py)
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random


class randomUserAgentMiddleware(UserAgentMiddleware):

    # Pool of desktop browser User-Agent strings to rotate through
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random UA for each outgoing request; setdefault keeps any
        # User-Agent that was already set explicitly on the request
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
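A quick way to sanity-check the middleware outside a full crawl (a hypothetical snippet, assuming the class lives in bilibili/middlewares.py as registered in settings.py):

from scrapy.http import Request
from bilibili.middlewares import randomUserAgentMiddleware

mw = randomUserAgentMiddleware()
req = Request('https://api.bilibili.com/x/relation/stat?vmid=1')
mw.process_request(req, spider=None)
print(req.headers['User-Agent'])  # one of the entries in user_agent_list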
GitHub repo: https://github.com/18370652038/scrapy-bilibili
Original post: https://www.cnblogs.com/dayouzi/p/10889789.html
For questions, please contact the original author.