首页 > > 网络编程 > 其它 >

使用urllib&BeautifulSoup爬取盗墓笔记…

2018-06-28 05:38:54来源：博客园阅读 ()

'''
download_html:接受url，返回html和BeautifulSoup实例
spider:接受html，返回url和数据
process_data:处理字符串及保存数据
controller:控制，调用
'''

# coding=utf-8

__author__ = 'Leslie'


from urllib.request import urlopen

from bs4 import BeautifulSoup

import re,collections,os


# 接受url，返回html和BeautifulSoup实例

def download_html(url):
    """Download *url* and parse the response body.

    The body is decoded as UTF-8 and parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(html, soup)`` tuple: the decoded page text and the
        ``BeautifulSoup`` instance built from it.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager closes the HTTP response even when read() or
    # decode() raises; previously the response was never closed.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')

    soup = BeautifulSoup(html, 'lxml')
    return html, soup


#spider接受html返回url队列、title队列、数据

def spider(html=False, soup=False):
    """Extract either the chapter index or the paragraphs of one chapter.

    The mode is selected by which arguments are truthy:

    * only ``soup``: index mode. Every ``<a>`` inside a
      ``<div class="box">`` contributes its ``href`` and a cleaned-up
      ``title``.
    * ``html`` and ``soup``: content mode. The ``<p>`` tags inside
      ``<div class="content-body">`` are returned.

    Args:
        html: Page text; only its truthiness is used, to pick the mode.
        soup: Parsed page to search.

    Returns:
        Index mode: a ``(queue_url, queue_title)`` tuple of two
        ``collections.deque`` objects of equal length.
        Content mode: the list of ``<p>`` tags, or an empty list when the
        page has no ``content-body`` div.
        ``None`` when ``soup`` is falsy.
    """
    if not soup:
        return None

    # Content mode: collect the chapter's paragraphs.
    if html:
        body = soup.find("div", class_="content-body")
        # A page without the expected container yields no paragraphs
        # instead of failing with AttributeError on None.
        if body is None:
            return []
        return body.find_all("p")

    # Index mode: collect chapter links and their titles.
    suffix = '_盗墓笔记9在线阅读_盗墓笔记全集'
    queue_url = collections.deque()
    queue_title = collections.deque()

    for box in soup.find_all("div", {"class": "box"}):
        for anchor in box.find_all("a"):
            href = anchor.get("href")
            title = anchor.get("title")
            # Skip anchors missing either attribute so that the two
            # queues always stay the same length.
            if href is None or title is None:
                continue

            # Drop the site-wide suffix appended to every title.
            title = title.replace(suffix, '')

            # Drop the leading "[...]" tag. str.find returns -1 when there
            # is no ']', so the slice then starts at 0 and keeps the whole
            # title instead of raising ValueError.
            title = title[title.find(']') + 1:].strip()

            queue_url.append(href)
            queue_title.append(title)

    return queue_url, queue_title

# 处理字符串及保存数据

def process_data(Data, title, output_dir=r'.\daomubiji1'):
    """Clean chapter paragraphs and append them to a text file.

    Args:
        Data: Iterable of objects exposing a ``.string`` attribute (the
            ``<p>`` tags of one chapter). Items whose ``.string`` is
            ``None`` are skipped.
        title: Desired file name. The characters ``\\ / ? : * < > " |``
            are removed from it before use.
        output_dir: Directory the file is written to. It must already
            exist. Defaults to the directory the script creates.

    Each kept paragraph has its spaces removed and is stripped. If it
    contains something that looks like a ``.com`` address, the text is cut
    at the start of that match. One line per paragraph is appended to the
    file, UTF-8 encoded. No file is created when no paragraph is kept.
    """
    # Remove every character that is not allowed in a file name, in a
    # single pass.
    title = title.translate(str.maketrans('', '', '\\/?:*<>"|'))

    abspath = os.path.join(os.path.abspath(output_dir), title)

    # Matches site watermarks such as "www.seputu.com". Raw string so that
    # "\." is a regex escape rather than an invalid string escape.
    url_pattern = re.compile(r"(http://)?([a-zA-Z]+\.)+com")

    lines = []
    for paragraph in Data:
        # Test the attribute itself rather than comparing str(...) with
        # "None", so a paragraph whose text really is "None" is kept.
        if paragraph.string is None:
            continue

        text = str(paragraph.string).replace(" ", "").strip()
        match = url_pattern.search(text)
        if match is not None:
            # Keep only the text in front of the watermark.
            text = text[:match.start()]
        lines.append(text + '\n')

    # Open the file once per chapter instead of once per paragraph.
    if lines:
        with open(abspath, 'a', encoding='utf-8') as fp:
            fp.writelines(lines)

# 控制，调用
def controller(url):
    """Crawl the index page at *url* and save every chapter it links to.

    The index page is downloaded and parsed into a queue of chapter URLs
    and a matching queue of titles. Each chapter is then downloaded, its
    paragraphs are extracted, and they are written to ``<title>.txt``.

    Args:
        url: Address of the index page listing the chapters.
    """
    # Fetch the queue of chapter URLs and the file-name titles.
    _, index_soup = download_html(url)
    queue_url, queue_title = spider(soup=index_soup)

    # Loop until the queues are empty. The previous condition tested the
    # current URL, which is never falsy, so the loop always ended with an
    # IndexError from popleft() on an empty deque.
    while queue_url and queue_title:
        chapter_url = queue_url.popleft()
        filename = queue_title.popleft() + '.txt'
        print(filename, chapter_url)

        html, soup = download_html(chapter_url)
        text_data = spider(html, soup)
        process_data(text_data, filename)

# Index page that lists every chapter to download.
url = r'http://www.seputu.com/'

# Create the output directory. exist_ok=True lets the script be re-run
# without failing with FileExistsError when the directory already exists.
os.makedirs(os.path.abspath(r'.\daomubiji1'), exist_ok=True)

controller(url)