Python Crawler Journey (Part 3): A Basic Crawler Architecture and Crawling…


In Part 1 of this series we covered how to crawl all A-share data from the Stockstar (证券之星) website, which mainly involved fetching and parsing web pages. In Part 2 we covered how to obtain and validate proxy IPs, which involved multithreaded programming and data storage. This time, building on those two parts, we will crawl the market data of the entire Stockstar site. The approach from Part 1 is fine for a single column, but crawling more than a hundred columns that way would be far too much manual work. So let's start by introducing a basic crawler architecture.

This article is organized around the six basic modules of the crawler framework: the crawler scheduler, the URL downloader, the URL manager, the HTML downloader, the HTML parser, and the data store. Their responsibilities are as follows:

Crawler scheduler: coordinates the work of the other five modules.

URL downloader: gathers the URL links of the pages whose data we want to crawl.

URL manager: manages URL links, maintains the sets of already-crawled and not-yet-crawled URLs, and provides an interface for fetching new URLs.

HTML downloader: fetches not-yet-crawled URLs from the URL manager and downloads the corresponding HTML pages.

HTML parser: takes the downloaded pages from the HTML downloader, extracts the useful data, and hands it to the data store.

Data store: persists the data extracted by the HTML parser to a file or a database.

To make this easier to follow, the workflow of the basic crawler framework is illustrated in the diagram below.

[Figure: workflow diagram of the basic crawler framework]

Below, we go through each of these six modules in detail.
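Before diving in, here is a minimal sketch of how a scheduler could drive the other modules. It is not the article's actual code: the class and method interfaces below are simplified assumptions that only mirror the components described above.

# rough sketch of the module interplay; interfaces are simplified assumptions
class Scheduler(object):
    def __init__(self, manager, downloader, parser, store):
        self.manager = manager        # URL manager: tracks crawled / uncrawled URLs
        self.downloader = downloader  # HTML downloader: fetches page content
        self.parser = parser          # HTML parser: turns a page into a data table
        self.store = store            # data store: persists the table

    def crawl(self, seed_urls):
        self.manager.add_new_urls(seed_urls)      # seed the uncrawled set
        while self.manager.has_new_url():
            url = self.manager.get_new_url()
            html = self.downloader.download(url)  # may return None on failure
            if html is None:
                continue
            table = self.parser.parse(html)       # assumed simplified parser interface
            self.store.output(table)              # write the parsed table out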

1. URL Downloader

The URL downloader works in two steps: it first collects the URLs from the site's left navigation bar, then follows each navigation URL to obtain the list of page links contained in every sub-column.

The following code fetches every link in the left navigation bar and saves them to a catalog file:

# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
import os
class get_catalog(object):
    '''Generate and work with the navigation (catalog) file'''
    def save_catalog(self):
        '''Fetch the Stockstar left-navigation entries and URLs and save them'''
        #fetch the page content
        url = 'http://quote.stockstar.com'
        request =urllib.request.Request(url = url)
        response = urllib.request.urlopen(request)
        content = response.read().decode('gbk')
        #extract the left-navigation section
        soup = BeautifulSoup(content,"lxml")
        soup = BeautifulSoup(str(soup.find_all('div',class_ = "subMenuBox")),"lxml")
        #initialize DataFrames for the first- and second-level sub-catalogs
        catalog1 = pd.DataFrame(columns = ["cata1","cata2","url2"])
        catalog2 = pd.DataFrame(columns = ["url2","cata3","url3"])
        #collect the catalog entries and their corresponding links
        index1 = 0;index2 = 0
        for content1 in soup.find_all('div',class_ = re.compile("list submenu?")):
            cata1 = re.findall('>(.*?)<',str(content1.h3.a))
            for content2 in content1.find_all('dl'):
                cata2 = re.findall('>(.*?)<',str(content2.dt.a).replace('\r\n',''))
                url2 = url + content2.dt.a['href']
                catalog1.loc[index1] = {'cata1':cata1[0],'cata2':cata2[0].split()[0],'url2':url2}
                index1 += 1
                for content3 in content2.find_all('li'):
                    cata3 = re.findall('·(.*?)<',str(content3.a))
                    url3 = url + content3.a['href']
                    catalog2.loc[index2] = {'url2':url2,'cata3':cata3[0],'url3':url3}
                    index2 += 1
        #join the first- and second-level catalog tables and save the result
        catalog = pd.merge(catalog1,catalog2,on='url2',how='left')
        catalog.to_csv('catalog.csv')
        
    def load_catalog(self):
        '''Create the catalog file if it does not exist, then load it'''
        if 'catalog.csv' not in os.listdir():
            self.save_catalog()
            print('catalog file generated')
        else:
            print('catalog file already exists')
        catalog = pd.read_csv('catalog.csv',encoding='gbk',usecols=range(1,6))
        print("catalog file loaded")
        return(catalog)

    def index_info(self,catalog,index):
        '''Build each row's table name (used when saving to the database) and get the row's final URL'''
        if str(catalog.loc[index]['cata3'])=='nan':
            table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2']
            url = catalog.loc[index]['url2']
        else:
            #symbols such as + and () cannot appear in database table names, so replace or strip them
            if '+' in catalog.loc[index]['cata3']:
                cata3 = catalog.loc[index]['cata3'].replace('+','')
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
            elif '(' in catalog.loc[index]['cata3']:
                cata3 = catalog.loc[index]['cata3'].replace('(','').replace(')','')
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
            else:
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + catalog.loc[index]['cata3']
            url = catalog.loc[index]['url3']
        return(table_name,url)
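For reference, a minimal usage sketch of this class (it writes catalog.csv to the working directory on the first run):

from get_catalog import get_catalog

getcata = get_catalog()
catalog = getcata.load_catalog()                   # builds catalog.csv if needed, then loads it
table_name, url = getcata.index_info(catalog, 0)   # table name and target URL for the first row
print(table_name, url)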

The following code collects the full list of page links for each sub-column:

import pandas as pd
from selenium import webdriver
import time
import re
import math
from get_catalog import get_catalog
class get_urls(object):
    '''Get the list of page links for one column'''
    def __init__(self,browser,url):
        self.browser = browser     #selenium browser instance
        self.url = url             #URL to crawl
        
    def get_browser(self):
        '''Open the URL in the browser'''
        state = 0
        test = 0
        while state == 0 and test < 5:
            try:
                self.browser.get(self.url)
                state = 1
                print('successfully connected to %s'%self.url)
            except:
                test += 1

    def get_element(self):
        '''Get the links behind the pagination buttons'''
        self.get_browser()
        element_list=[]
        for i in range(1,8):
            try:                                       
                element = self.browser.find_element_by_xpath('//*[@id="divPageControl1"]/a[%d]'%i).get_attribute('href')
                element_list.append(element)
            except:
                time.sleep(0.2)
        return(element_list)

    def get_urllist(self):
        '''Build the full list of page URLs from the pagination links'''
        element_list = self.get_element()
        if len(element_list)<=1:
            urls = [self.url]
        else:
            try:
                max_number = re.search(r'_(\d*)\.',element_list[len(element_list)-3])
                begin = max_number.start() + 1
                end = max_number.end() - 1
                int_max_number = int(element_list[len(element_list)-3][begin:end])
                urls = []
                for i in range(1,int_max_number + 1):
                    url = element_list[len(element_list)-3][:begin] + str(i) + element_list[len(element_list)-3][end:]
                    urls.append(url)
            except:
                urls = [self.url]
        return(urls)

2. URL Manager

The URL manager holds two collections: the set of already-crawled URLs and the set of not-yet-crawled URLs. Python's set type is used mainly for its built-in de-duplication.

Besides the two URL sets, the URL manager also needs to expose the following interface for the other modules:

has_new_url(): check whether any URLs are left to crawl.

add_new_url(url) and add_new_urls(urls): add new URLs to the not-yet-crawled set.

get_new_url(): fetch one not-yet-crawled URL.

The URL manager module is implemented as follows:

# coding: utf-8
class UrlManager(object):
    '''URL manager'''
    def __init__(self):
        self.new_urls = set() #set of not-yet-crawled URLs
        self.old_urls = set() #set of already-crawled URLs
    def has_new_url(self):
        '''Check whether there are URLs left to crawl'''
        return(self.new_url_size()!=0)
    def get_new_url(self):
        '''Fetch one not-yet-crawled URL'''
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return(new_url)
    def add_new_url(self,url):
        '''Add a new URL to the not-yet-crawled set'''
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)
    def add_new_urls(self,urls):
        '''Add a list of new URLs to the not-yet-crawled set'''
        if urls is None or len(urls)==0:
            return
        for url in urls:
            self.add_new_url(url)
    def new_url_size(self):
        '''Return the size of the not-yet-crawled URL set'''
        return(len(self.new_urls))

3. HTML Downloader

The HTML downloader is responsible for downloading web pages. Pay attention to the page encoding here so that the downloaded pages are not garbled.
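As an illustration of the encoding point, a small helper (an assumption for this article, not part of the original code) could try GBK first, since quote.stockstar.com serves GBK pages, and fall back to UTF-8:

def decode_page(raw_bytes):
    '''Decode downloaded bytes, trying GBK first and then UTF-8.'''
    for enc in ('gbk', 'utf-8'):
        try:
            return raw_bytes.decode(enc)
        except UnicodeDecodeError:
            continue
    return raw_bytes.decode('gbk', errors='ignore')  # last resort: drop undecodable bytes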

Our IP may get blocked while fetching pages, so we first crawl a pool of proxy IPs for the HTML downloader to fall back on.

The following code builds the proxy IP pool:

import urllib.request
import re
import time
import random
import socket
import threading

class proxy_ip(object):
    '''Fetch working proxy IPs and save them'''
    def __init__(self,url,total_page):
        self.url = url                   #target URL used to validate the proxies
        self.total_page = total_page     #number of proxy-list pages to walk through
        
    def get_proxys(self):
        '''Scrape proxy IPs'''
        user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
              'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
              'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
              'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        ip_totle=[]
        for page in range(1,self.total_page+1):
            #url = 'http://www.httpsdaili.com/?page='+str(page)
            #url='http://www.kuaidaili.com/free/inha/'+str(page)+'/'
            url='http://www.xicidaili.com/nn/'+str(page) #xicidaili free proxy list
            headers={"User-Agent":random.choice(user_agent)}
            try:
                request=urllib.request.Request(url=url,headers=headers)
                response=urllib.request.urlopen(request)
                content=response.read().decode('utf-8')
                print('get page',page)
                pattern=re.compile(r'<td>(\d.*?)</td>')  #capture <td>...</td> cells whose first character is a digit
                ip_page=re.findall(pattern,str(content))
                ip_totle.extend(ip_page)
            except Exception as e:
                print(e)
            time.sleep(random.choice(range(1,5)))
        #print what was scraped
        print('proxy IP address','\t','port','\t','speed','\t','verified at')
        for i in range(0,len(ip_totle),4):
            print(ip_totle[i],'    ','\t',ip_totle[i+1],'\t',ip_totle[i+2],'\t',ip_totle[i+3])
        #format the proxies for urllib
        proxys = []
        for i in range(0,len(ip_totle),4):
            proxy_host = ip_totle[i]+':'+ip_totle[i+1]
            proxy_temp = {"http":proxy_host}
            proxys.append(proxy_temp)
        return(proxys)

    def test(self,lock,proxys,i,f):
        '''Check whether one proxy IP works'''
        socket.setdefaulttimeout(15)  #set the global socket timeout
        url = self.url  
        try:
            proxy_support = urllib.request.ProxyHandler(proxys[i])
            opener = urllib.request.build_opener(proxy_support)
            opener.addheaders=[("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64)")]
            urllib.request.install_opener(opener)
            #res = urllib.request.urlopen(url).read().decode('gbk')
            res = urllib.request.urlopen(url).read().decode('utf-8')
            print(res)
            lock.acquire()     #acquire the lock
            print(proxys[i],'is OK')
            f.write('%s\n' %str(proxys[i]))  #record this working proxy
            lock.release()     #release the lock
        except Exception as e:
            lock.acquire()
            print(proxys[i],e)
            lock.release()
            
    def get_ip(self):
        '''Validate the proxies with multiple threads'''
        f = open('proxy_ip.txt','a+')  #file that stores the working proxies
        lock=threading.Lock()  #create a lock
        #validate in multiple threads
        proxys = self.get_proxys()
        threads=[]
        for i in range(len(proxys)):
            thread=threading.Thread(target=self.test,args=[lock,proxys,i,f])
            threads.append(thread)
            thread.start()
        #block the main thread until all worker threads finish
        for thread in threads:
            thread.join()           
        f.close()  #close the file

The HTML downloader module is implemented as follows:

# _*_ coding:utf-8 _*_
from firstSpider.get_proxy_ip import proxy_ip
import urllib.request
import random
import os
import socket
import time
import re
class HtmlDownloader(object):
    '''Fetch page content'''
    def download(self,url):
        user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
                      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                      'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                      'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        state = 0;test = 0
        socket.setdefaulttimeout(20)  #set the global socket timeout
        while state == 0 and test < 5:
            try:
                request = urllib.request.Request(url=url,headers={"User-Agent":random.choice(user_agent)})#pick a random user agent from the list
                response = urllib.request.urlopen(request)
                readhtml = response.read()
                content = readhtml.decode('gbk')       #decode the page content
                time.sleep(random.randrange(1,6))
                if re.search('Auth Result',content) == None:
                    state = 1
            except Exception as e:
                print('failed to fetch the page with the local IP','',e)
                if 'proxy_ip.txt' not in os.listdir() or os.path.getsize('proxy_ip.txt') == 0:
                    print('no proxy IP pool found, building a new one')
                    pool = proxy_ip(url,5)
                    pool.get_ip()
                    print('proxy IP pool ready')
                else:
                    f = open('proxy_ip.txt','r')
                    proxys_ip = f.readlines()
                    f.close()
                    random.shuffle(proxys_ip)
                    for i in range(len(proxys_ip)):
                        try:
                            proxy_support = urllib.request.ProxyHandler(eval(proxys_ip[i][:-1]))
                            opener = urllib.request.build_opener(proxy_support)
                            opener.addheaders=[("User-Agent",random.choice(user_agent))]
                            urllib.request.install_opener(opener)
                            response = urllib.request.urlopen(url)
                            readhtml = response.read()
                            content = readhtml.decode('gbk')
                            time.sleep(random.randrange(1,6))
                            if re.search('Auth Result',content) == None:  #skip responses that flag us as an invalid user
                                state = 1
                                print('successfully connected through proxy',proxys_ip[i])
                                break
                        except urllib.error.HTTPError as e:
                            print(proxys_ip[i],'request failed',e.code)
                        except urllib.error.URLError as e:
                            print(proxys_ip[i],'request failed',e.reason)
                        except Exception as e:
                            print(proxys_ip[i],'request failed',e)
                    try:
                        if i == len(proxys_ip)-1:
                            os.remove('proxy_ip.txt')
                            print('proxy IP pool exhausted, file deleted')
                    except:        #case where i was never defined (the pool file was empty)
                        os.remove('proxy_ip.txt')
                        print('proxy IP pool was empty, file deleted')
                        time.sleep(60)
                test += 1
        if test == 5:
            print('failed to fetch the content of %s'%url)
            content = None
        return(content)

4. HTML Parser

The HTML parser parses the pages downloaded by the HTML downloader and extracts the content we want.

The parsing in this article relies mainly on regular expressions and BeautifulSoup. The HTML parser is implemented as follows:

# coding:utf-8
import re
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import numpy as np
import time
import datetime
class HtmlParser(object):
    '''Parse page content'''
    def __init__(self,content):
        self.soup = BeautifulSoup(content,"lxml") #content to parse
        
    def get_header(self):
        '''Get the table header'''
        try:
            header = []
            for tag in self.soup.thead.find_all('td'):
                title = str(tag)
                title = title.replace(' ','')
                title = title.replace('\n','')
                header.extend(re.findall('>(.*?)<',title))
            header_name = []
            for data in header:
                if data != '':
                    header_name.append(data.strip())
            header_name.append('数据时间')
        except:     #no header: return an empty list to mark the content as invalid
            header_name = []
            return(header_name)
        h2_len = len(self.soup.thead.find_all('td',class_ = "h2"))
        datalist_len = len(self.soup.find_all('tbody',id="datalist") + self.soup.find_all('tbody',id="datalist1") + self.soup.find_all('tbody',id="datalist2"))
        if h2_len >= 6 or datalist_len == 0:    #rule out inconsistent header formats and tables with no data
            header_name = []
        return(header_name)

    def get_header2(self):
        '''Get the table header when it spans two rows'''
        stati_date = []
        for date in self.soup.thead.find_all('td',class_ = "double align_center"):
            stati_date.extend(re.findall('>(.*?)<',str(date)))
        header_total = self.get_header()
        header_name = header_total[:-5]
        header_name = header_name[:2] + header_total[-5:-1] + header_name[2:]
        if stati_date[0] in header_name:
            header_name.remove(stati_date[0])
        if stati_date[1] in header_name:
            header_name.remove(stati_date[1])
        header_name.append('三四列统计时间')
        header_name.append('五六列统计时间')
        header_name.append('数据时间')
        return(header_name,stati_date)
        
    def get_datatime(self):
        '''Get the data timestamp'''
        try:
            date = re.findall('数据时间:(.*?)<',str(self.soup.find_all('span',class_ = "fl")))[0][0:10]
        except:            #if missing, infer it from the system date
            now_time = time.localtime()
            if time.strftime("%w",now_time) in ['1','2','3','4','5']:
                date = time.strftime("%Y-%m-%d",now_time)
            elif time.strftime("%w",now_time) == '6':
                dt = (datetime.datetime.now() - datetime.timedelta(days = 1))
                date = dt.strftime("%Y-%m-%d")
            else:
                dt = (datetime.datetime.now() - datetime.timedelta(days = 2))
                date = dt.strftime("%Y-%m-%d")
        return(date)
    
    def get_datalist(self):
        '''Get the table data'''
        if len(self.soup.find_all('tbody',id="datalist")) >= 1:
            soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist")[0]),"lxml")
        elif len(self.soup.find_all('tbody',id="datalist1")) >= 1:
            soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist1")[0]),"lxml")
        else:
            soup = BeautifulSoup(str(self.soup.find_all('tbody',id="datalist2")[0]),"lxml")
        date = self.get_datatime()
        row = len(soup.tbody.find_all('tr'))
        #initialize the array for both single-row and double-row headers
        if len(self.soup.thead.find_all('td',class_ = "double align_center")) == 0:
            header_name = self.get_header()
            col = len(header_name)
            datalist = np.array(['']*(row * col),dtype = 'U24').reshape(row,col)
            flag = 1
        else:
            header_name = self.get_header2()[0]
            col = len(header_name)
            datalist = np.array(['']*(row * col),dtype = 'U24').reshape(row,col)
            flag = 2      
        for i in range(row):        #extract each row and write it into the array
            detail = re.findall('>(.*?)<',str(soup.find_all('tr')[i]))
            for blank in range(detail.count('')):
                detail.remove("")
            try:
                if flag == 1:
                    detail.append(date)
                    datalist[i] = detail
                elif flag == 2:
                    stati_date = self.get_header2()[1]
                    detail.append(stati_date[0])
                    detail.append(stati_date[1])
                    detail.append(date)
                    datalist[i] = detail              
            except:
                datalist[i][0] = detail[0]
                datalist[i][col-1] = date
        return(datalist,header_name)
    
    def get_dataframe(self):
        '''Combine the header and data into a DataFrame and return it'''
        datalist,header_name = self.get_datalist()
        table = pd.DataFrame(datalist ,columns = header_name)
        return(table)

5. Data Store

The data store persists the data extracted by the parser. There are many ways to store it; this article uses a MySQL database.

The parser turns each page of stock data into a DataFrame, which is then written straight into the database through a database connection engine.

The data store module is implemented as follows:

import pymysql
from sqlalchemy import create_engine
import pandas as pd
from firstSpider.HtmlParser import HtmlParser

class DataOutput(object):
    '''Write the data into a MySQL database'''
    def __init__(self,engine,table,table_name):
        self.engine = engine         #database connection engine
        self.table = table           #DataFrame to store
        self.table_name = table_name #table name
    def output(self):
        self.table.to_sql(name = self.table_name,con = self.engine,if_exists = 'append',index = False,index_label = False)
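A minimal usage sketch, assuming the DataOutput class above is importable and substituting your own MySQL user, password, and database name for the placeholders:

from sqlalchemy import create_engine
import pandas as pd

# placeholder credentials and database name, for illustration only
engine = create_engine('mysql+pymysql://user:password@localhost:3306/stockdata?charset=utf8')
table = pd.DataFrame({'code': ['600000'], 'price': [10.5]})
DataOutput(engine, table, 'demo_table').output()   # appends the rows to the demo_table table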

6. Crawler Scheduler

The crawler scheduler ties the modules above together and divides the work among them so the job gets done efficiently.

The scheduler speeds up execution by running the per-page crawls in a thread pool. The scheduler module is implemented as follows:

from firstSpider.UrlManager import UrlManager
from firstSpider.HtmlDownloader import HtmlDownloader
from firstSpider.HtmlParser import HtmlParser
from firstSpider.DataOutput import DataOutput
from sqlalchemy import create_engine
import threadpool,time
 
class SpiderMan(object):
    '''Crawler robot'''
    def __init__(self,engine,table_name):
        self.engine = engine               #database connection engine
        self.table_name = table_name       #table name
        self.manager = UrlManager()        #URL manager
        self.downloader = HtmlDownloader() #HTML downloader

    def spider(self,url):
        '''Crawl a single page'''
        # the HTML downloader fetches the page
        html = self.downloader.download(url)
        if html is None:   # give up on this page if the download failed after all retries
            return(None)
        f = open('stock.txt','w')
        f.write(html)
        f.close()
        # the HTML parser extracts the page data
        parser = HtmlParser(html)
        if len(parser.get_header()) > 0:
            data = parser.get_dataframe()
            # the data store saves the result
            out = DataOutput(self.engine,data,self.table_name)
            out.output()
            print('data from %s  saved into table  %s'%(url,self.table_name))
        time.sleep(1)
        return(parser.get_datatime())
            
    def crawl(self,urls):
        '''Crawl every link in one column's URL list'''
        self.manager.add_new_urls(urls)
        # keep going while the URL manager still has new URLs
        pool = threadpool.ThreadPool(10)
        while(self.manager.has_new_url()):
            # fetch a new URL from the URL manager
            new_url = self.manager.get_new_url()
            requests = threadpool.makeRequests(self.spider,(new_url,))
            pool.putRequest(requests[0])
        pool.wait()

Put the code of each module above into its own .py file under a firstSpider folder, then run the main program below to crawl the stock data of the entire Stockstar site.
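The layout implied by the import statements would look roughly like this (the file names are inferred from the imports; adjust them if yours differ):

project/
    main.py                 # the main program below
    firstSpider/
        __init__.py         # may be empty; makes firstSpider importable as a package
        get_catalog.py      # navigation catalog builder
        get_urls.py         # per-column link list builder
        get_proxy_ip.py     # proxy IP pool (class proxy_ip)
        UrlManager.py       # URL manager
        HtmlDownloader.py   # HTML downloader
        HtmlParser.py       # HTML parser
        DataOutput.py       # data store
        SpiderMan.py        # crawler scheduler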

from firstSpider.get_proxy_ip import proxy_ip
from firstSpider.get_catalog import get_catalog
from firstSpider.get_urls import get_urls
from firstSpider.SpiderMan import SpiderMan
from selenium import webdriver
from sqlalchemy import create_engine
import time

'''Download all of today's Stockstar data by following the left navigation'''
if __name__ == "__main__":
    print('fetching proxy IPs and checking that they work')
    ip_pool = proxy_ip('http://quote.stockstar.com',8)
    ip_pool.get_ip()
    print('proxy IP pool ready')
    getcata = get_catalog()
    catalog = getcata.load_catalog()
    start = 0
    end = len(catalog)
    catalog = catalog[start : end]
    print('initializing the browser')
    browser = webdriver.Chrome()
    engine = create_engine('mysql+pymysql://root:Jwd116875@localhost:3306/scott?charset=utf8')
    for index in range(start,end):
        table_name,url = getcata.index_info(catalog,index)
        stop_url = ['http://quote.stockstar.com/gold/globalcurrency.shtml']  #page links we want to skip
        if url not in stop_url:
            geturls = get_urls(browser,url)
            urls = geturls.get_urllist()
            print('link list for  %s  collected'%table_name)
            Spider_man = SpiderMan(engine,table_name)
            Spider_man.crawl(urls)
            datatime = Spider_man.spider(urls[0])
            print('%s: incremental data for column %s on %s crawled'%(index,table_name,datatime))

Small as it is, the sparrow has all the vital organs: the above is a full-site crawl built on a simple crawler framework. There is still plenty of room for improvement in execution speed and in disguising the crawler, and I hope to keep learning and exchanging ideas with all of you.
