Crawling 盗墓笔记 (Daomu Biji) with urllib & BeautifulSoup

2018-06-28 05:38:54 · Source: 博客园


# coding=utf-8
'''
download_html: takes a url, returns the html text and a BeautifulSoup instance
spider:        takes html/soup, returns the url queue, the title queue, or the chapter text
process_data:  cleans up the strings and saves the data
controller:    drives the crawl, calling the functions above
'''

__author__ = 'Leslie'

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re, collections, os


# Take a url, return the html and a BeautifulSoup instance
def download_html(url):
    html = urlopen(url).read().decode('utf-8')  # fetch the page data
    soup = BeautifulSoup(html, 'lxml')          # build the BeautifulSoup instance
    return html, soup
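download_html above does no error handling, so a single failed request stops the whole crawl. Below is a minimal sketch of a safer variant; it keeps the same (html, soup) return contract, but the name download_html_safe and the timeout value are my own additions, not part of the original script (urlopen and BeautifulSoup are the ones imported at the top).

from urllib.error import URLError

def download_html_safe(url, timeout=10):
    # Hypothetical variant of download_html (not in the original post):
    # fetch with a timeout and return (None, None) instead of raising.
    try:
        html = urlopen(url, timeout=timeout).read().decode('utf-8')
    except URLError as err:
        print('failed to fetch', url, err)
        return None, None
    return html, BeautifulSoup(html, 'lxml')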


# spider: takes html/soup, returns the url queue and title queue, or the chapter text
def spider(html=False, soup=False):

    # Crawl the index page for urls and titles
    if not html and soup:
        queue_url = collections.deque()    # queue of urls
        queue_title = collections.deque()  # queue of titles

        # Locate the elements and pull the href and title attributes off each <a>
        for item in soup.find_all("div", {"class": "box"}):
            for Alabel in item.find_all("a"):
                queue_url.append(Alabel["href"])

                # Strip the extra characters from the title string
                Str1 = Alabel["title"]
                Str2 = '_盗墓笔记9在线阅读_盗墓笔记全集'
                if Str2 in Str1:
                    Str1 = Str1.replace(Str2, '')

                index = Str1.index(']')
                Str1 = Str1[index + 1:].strip()
                queue_title.append(Str1)

        return queue_url, queue_title

    # Crawl the chapter text
    if html and soup:
        all_p_label = soup.find("div", class_="content-body").find_all("p")
        return all_p_label
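For clarity, the title cleanup inside the first branch just drops the site's fixed suffix and the leading bracketed tag. A quick illustration on a made-up title string (only the suffix constant comes from the code above; the sample title is hypothetical):

s = '[盗墓笔记] 七星鲁王宫_盗墓笔记9在线阅读_盗墓笔记全集'  # hypothetical sample title
s = s.replace('_盗墓笔记9在线阅读_盗墓笔记全集', '')         # drop the fixed suffix
s = s[s.index(']') + 1:].strip()                             # drop the leading [...] tag
print(s)  # 七星鲁王宫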

# Clean up the strings and save the data
def process_data(Data, title):

    # Remove characters that cannot appear in file names: [\/?:*<>"|]
    while '\\' in title:
        index = title.index('\\')
        title = title[:index] + title[index + 1:]

    matchList = re.findall('[/?:*<>"|]*', title)
    matchStr = ''.join(matchList)  # e.g. '?><'

    title = list(title)
    for j in matchStr:
        title.remove(j)
    title = ''.join(title)

    # Absolute path of the file to save
    abspath = os.path.join(os.path.abspath(r'.\daomubiji1'), title)

    # Strip extra strings from the body text, such as: www.setupu.com
    CMP = re.compile(r"(http://)?([a-zA-Z]+\.)+com")  # compiled regex object

    for i in Data:
        each_string = str(i.string).replace(" ", "").strip()

        if each_string != "None":
            Match = re.search(CMP, each_string)  # match against the string

            # Append the text to the txt file
            with open(abspath, 'a', encoding='utf-8') as fp:
                if Match != None:
                    Newstring = each_string[:Match.span()[0]]
                    fp.write(Newstring + '\n')
                else:
                    fp.write(each_string + '\n')
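The filename cleanup at the top of process_data (the while loop plus findall/join/remove) works, but the same step can be written as one substitution. A sketch, assuming the goal is simply to strip every character Windows forbids in file names; clean_filename is a hypothetical helper, not part of the original script:

import re

def clean_filename(title):
    # Hypothetical helper: remove every character that is illegal in
    # Windows file names in a single pass.
    return re.sub(r'[\\/?:*<>"|]', '', title)

# clean_filename(r'第一章\血尸?')  ->  '第一章血尸'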

# Drive the crawl
def controller(url):

    # Get the url queue and the title (file name) queue from the index page
    html, soup = download_html(url)
    queue_url, queue_title = spider(soup=soup)

    # Keep crawling until the url queue is empty
    while queue_url:
        url = queue_url.popleft()
        title = queue_title.popleft() + '.txt'
        print(title, url)

        html, soup = download_html(url)
        text_data = spider(html, soup)
        process_data(text_data, title)

url = r'http://www.seputu.com/'
os.mkdir(os.path.abspath(r'.\daomubiji1'))
controller(url)
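Two small practical notes, both my own suggestions rather than part of the post: os.mkdir raises FileExistsError if daomubiji1 already exists, and the loop fires requests back to back. A guarded setup and an optional pause could look like:

import os, time

# Create the output folder only if it does not exist yet (assumed improvement).
os.makedirs(os.path.abspath(r'.\daomubiji1'), exist_ok=True)

# Inside controller's loop, an optional short pause keeps the crawl polite:
# time.sleep(1)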
