Python 爬虫从小白到高手 Day2:动态页面的爬取
2018-06-23 13:17:21来源:未知 阅读 ()
"""Crawl Toutiao search results for the keyword '街拍' (street snaps).

For every article found on a search-result page, the script extracts the
gallery JSON embedded in the article HTML, downloads each gallery image into
a local ``tupian`` directory and stores a summary document in MongoDB.
"""
import json
import os
import re
import time
from hashlib import md5

import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo.errors import PyMongoError

from config import MONGO_DB, MONGO_TABLE, MONGO_URL

SEARCH_URL = 'https://www.toutiao.com/search_content/?'

# Seconds to wait for any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Browser-like user agent sent when fetching article pages.
ARTICLE_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
}

# The article page embeds its gallery as: gallery: JSON.parse("<escaped json>"),
GALLERY_PATTERN = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)

# connect=False defers the actual connection until the first operation.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page(offset):
    """Fetch one page of search results.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON response, or ``None`` if the request failed, the
        status code was not 200, or the body was not valid JSON.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    try:
        response = requests.get(SEARCH_URL, params=params,
                                timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('request page fail---', offset, exc)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page).
        return None


def save_to_mongo(result):
    """Insert ``result`` into the configured MongoDB collection.

    Args:
        result: The document (dict) to store.

    Returns:
        ``True`` if the insert succeeded, ``False`` otherwise.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported way to insert a single document.
        db[MONGO_TABLE].insert_one(result)
    except PyMongoError as exc:
        print('save fail', exc)
        return False
    print('save success', result)
    return True


def download_real_image(url):
    """Download the image at ``url`` and store it via ``save_image``.

    Failures are reported on stdout and otherwise ignored so that a single
    broken image does not stop the crawl. Always returns ``None``.
    """
    print('downloading---', url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('request image fail---', url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None


def save_image(content):
    """Write image bytes to ``<cwd>/tupian/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    are written only once.

    Args:
        content: Raw image bytes.
    """
    files_path = os.path.join(os.getcwd(), 'tupian')
    os.makedirs(files_path, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(files_path, file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def get_real_image_path(article_url):
    """Extract and download the gallery images of one article.

    Args:
        article_url: URL of the article page.

    Returns:
        A dict with keys ``title``, ``url`` and ``image_urls`` when the page
        contains a gallery with ``sub_images``; ``None`` otherwise (request
        failure, no gallery found, or unparsable gallery JSON).
    """
    try:
        response = requests.get(article_url, headers=ARTICLE_HEADERS,
                                timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('request article fail---', article_url)
        return None

    soup = BeautifulSoup(response.text, "lxml")
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''

    match = GALLERY_PATTERN.search(response.text)
    if not match:
        return None

    # The JSON is embedded inside a JS string literal, so it is
    # backslash-escaped; strip the backslashes before parsing.
    raw_json = match.group(1).replace('\\', '')
    try:
        data = json.loads(raw_json)
    except ValueError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    sub_images = data.get('sub_images') or []
    images_urls = [item.get('url') for item in sub_images if item.get('url')]
    for image_url in images_urls:
        download_real_image(image_url)
    return {
        'title': title,
        'url': article_url,
        'image_urls': images_urls,
    }


def download_image(jsonData):
    """Process every article listed in one search-result page.

    Args:
        jsonData: Decoded search response as returned by ``get_page``; may be
            ``None`` when the page could not be fetched.
    """
    if not jsonData:
        return
    for item in jsonData.get('data') or []:
        # NOTE: entries without 'article_url' use other cell formats (for
        # example a 'display' payload whose 'results' carry
        # 'original_page_url'). There are many such cell types and they are
        # intentionally not handled here.
        if not item or 'article_url' not in item:
            continue
        result = get_real_image_path(item.get('article_url'))
        if result:
            save_to_mongo(result)


def main():
    """Crawl result pages ``start_page`` (inclusive) to ``end_page`` (exclusive)."""
    start_page = 1
    end_page = 2
    for i in range(start_page, end_page):
        # Pause between pages to avoid hammering the server.
        time.sleep(1)
        offset = i * 20
        jsonData = get_page(offset)
        download_image(jsonData)


if __name__ == "__main__":
    main()
config.py
"""Settings for the Toutiao '街拍' crawler."""

# Host passed to pymongo.MongoClient by the crawler script.
MONGO_URL = 'localhost'
# Name of the MongoDB database the crawler opens.
MONGO_DB = 'jiepai'
# Name of the collection the crawler inserts documents into.
MONGO_TABLE = 'jiepai'

# NOTE(review): the three settings below are not referenced by the crawler
# script shown alongside this file (it hard-codes its own page range and
# keyword) -- presumably intended as the page-group range and search keyword;
# confirm before relying on them.
GROUP_START = 0
GROUP_END = 20
KEYWORD = '街拍'
标签:
版权声明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有
- python3基础之“术语表(2)” 2019-08-13
- python3 之 字符串编码小结(Unicode、utf-8、gbk、gb2312等 2019-08-13
- Python3安装impala 2019-08-13
- 小白如何入门 Python 爬虫? 2019-08-13
- python_字符串方法 2019-08-13
IDC资讯: 主机资讯 注册资讯 托管资讯 vps资讯 网站建设
网站运营: 建站经验 策划盈利 搜索优化 网站推广 免费资源
网络编程: Asp.Net编程 Asp编程 Php编程 Xml编程 Access Mssql Mysql 其它
服务器技术: Web服务器 Ftp服务器 Mail服务器 Dns服务器 安全防护
软件技巧: 其它软件 Word Excel Powerpoint Ghost Vista QQ空间 QQ FlashGet 迅雷
网页制作: FrontPages Dreamweaver Javascript css photoshop fireworks Flash