Python-爬虫小计
2019-01-21 02:42:55来源:博客园 阅读 ()
# -*-coding:utf8-*-
import requests
from bs4 import BeautifulSoup
import time
import os
import urllib
import re
import json
requests.packages.urllib3.disable_warnings()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
}
def get_bs(url):
res = requests.get(url, headers=headers, verify=False)
bs = BeautifulSoup(res.content, 'lxml')
return bs
def get_first_url():
first_url_list = []
page = 1
for i in range(page):
root_url = "https://www.model61.com/mold.php?page={}".format(str(i+1))
bs = get_bs(root_url)
for i in bs.select("dt a"):
src = i.get('href')
if "php" in src:
first_url = "https://www.model61.com/{}".format(src)
first_url_list.append(first_url)
return first_url_list
def get_second_url(first_url):
data = {}
bs = get_bs(first_url)
for i in bs.select(".cont-top a"):
src = i.get('href')
if "album_s" in src:
second_url = "https://www.model61.com/{}".format(src)
#print("second_url",second_url)
data["second_url"] = second_url
for j in bs.select(".content_center_date"):
data["identity"] = j.get_text()
return data
def get_thred_url(second_url):
bs = get_bs(second_url)
for i in bs.select("dt a"):
src = i.get('href')
if "album_list" in src:
thred_url = "https://www.model61.com/{}".format(src)
#print("thred_url", thred_url)
return thred_url
def get_image_list(thred_url):
image_list = []
bs = get_bs(thred_url)
for i in bs.select(".album_list_left a")+bs.select(".album_list_right a"):
src = i.get('href')
image_path = "https://www.model61.com/{}".format(src)
image_list.append(image_path)
#print("image_path",image_path)
return image_list
def download_image(image_path):
pass
if __name__ == '__main__':
first_url_list = get_first_url()
for first_url in first_url_list:
data = get_second_url(first_url)
second_url = data['second_url']
thred_url = get_thred_url(second_url)
image_list = get_image_list(thred_url)
data["image_list"] = image_list
print(data)
原文链接:https://www.cnblogs.com/buscar/p/10288718.html
如有疑问请与原作者联系
标签:
版权申明:本站文章部分自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有
上一篇:python_文件操作
- 小白如何入门 Python 爬虫? 2019-08-13
- python day2-爬虫实现github登录 2019-08-13
- python爬虫学习之爬取超清唯美壁纸 2019-08-13
- python爬虫学习之用Python抢火车票的简单小程序 2019-08-13
- python爬虫常用库 2019-08-13
IDC资讯: 主机资讯 注册资讯 托管资讯 vps资讯 网站建设
网站运营: 建站经验 策划盈利 搜索优化 网站推广 免费资源
网络编程: Asp.Net编程 Asp编程 Php编程 Xml编程 Access Mssql Mysql 其它
服务器技术: Web服务器 Ftp服务器 Mail服务器 Dns服务器 安全防护
软件技巧: 其它软件 Word Excel Powerpoint Ghost Vista QQ空间 QQ FlashGet 迅雷
网页制作: FrontPages Dreamweaver Javascript css photoshop fireworks Flash