python3爬虫-使用requests爬取起点小说
2019-04-25 06:58:33来源:博客园 阅读 ()
import os
import time
from urllib import parse

import requests
from lxml import etree

# Shared crawl settings. They live at module level (not inside the
# ``__main__`` guard) so the functions below also work when this file is
# imported instead of being run as a script.
session = requests.Session()
sleep_time = 5  # seconds to wait between requests, to reduce load on the server
timeout = 5  # per-request timeout in seconds
BASE_PATH = r"D:\图片\LSZJ"  # directory where the chapter files are stored
headers = {
    "Referer": "read.qidian.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# Characters that may not appear in a Windows file name.
_ILLEGAL_FILENAME_CHARS = '\\/:*?"<>|'


def _parse_html(resoponse):
    """Parse the body of ``resoponse`` into an lxml tree, or return None on failure."""
    try:
        return etree.HTML(resoponse.text)
    except (etree.LxmlError, ValueError):
        return None


def _safe_filename(title):
    """Return ``title`` with characters that are illegal in file names replaced by ``_``."""
    cleaned = "".join(
        "_" if ch in _ILLEGAL_FILENAME_CHARS else ch for ch in title
    ).strip()
    return cleaned or "untitled"


def get_page_html(url):
    """Send a GET request to ``url`` through the shared session.

    Returns:
        The response object when the status code is 200; ``None`` when the
        request fails (timeout, connection error, ...) or the status code
        is anything else.
    """
    # The request itself must be inside the ``try``: that is the call that
    # raises on timeouts and connection errors.
    try:
        response = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response
    return None


def get_next_url(resoponse):
    """Return the absolute URL of the next chapter.

    Args:
        resoponse: the response of the current chapter page, or ``None``.

    Returns:
        The next-chapter URL resolved against the current page URL, or
        ``None`` when there is no response or no next-chapter link.
    """
    if not resoponse:
        return None
    selector = _parse_html(resoponse)
    if selector is None:
        return None
    hrefs = selector.xpath("//a[@id='j_chapterNext']/@href")
    if not hrefs:
        return None
    return parse.urljoin(resoponse.url, hrefs[0])


def xs_content(resoponse):
    """Extract the chapter title and the chapter text from a chapter page.

    Args:
        resoponse: the response of the chapter page, or ``None``.

    Returns:
        A ``(title, paragraphs)`` tuple where ``paragraphs`` is the list of
        text nodes found in the reading area, or ``None`` when there is no
        response or the page has no chapter title.
    """
    if not resoponse:
        return None
    selector = _parse_html(resoponse)
    if selector is None:
        return None
    titles = selector.xpath("//h3[@class='j_chapterName']/text()")
    if not titles:
        # Not a chapter page; report "nothing found" instead of raising.
        return None
    paragraphs = selector.xpath(
        "//div[contains(@class,'read-content') and contains(@class,'j_readContent')]//p/text()")
    return titles[0], paragraphs


def write_to_txt(info_tuple: tuple):
    """Write one chapter to ``<BASE_PATH>/<title>.txt``, one paragraph per line.

    Args:
        info_tuple: ``(title, paragraphs)`` as returned by ``xs_content``.
            A falsy value is ignored.

    A chapter whose file already exists is skipped, so an interrupted crawl
    can be restarted without rewriting the chapters that were already saved.
    """
    if not info_tuple:
        return
    title, paragraphs = info_tuple[0], info_tuple[1]
    os.makedirs(BASE_PATH, exist_ok=True)
    # Check the very path that is written (including the ".txt" suffix).
    path = os.path.join(BASE_PATH, _safe_filename(title) + ".txt")
    if os.path.exists(path):
        return
    with open(path, "wt", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in paragraphs)


def run(url):
    """Crawl chapters starting at ``url`` and follow the next-chapter links.

    Each chapter is saved with ``write_to_txt``. The crawl stops when a page
    cannot be fetched, has no chapter content, or has no next-chapter link.
    The loop is iterative so that long novels do not exhaust the recursion
    limit.
    """
    while url:
        html = get_page_html(url)
        info_tupe = xs_content(html)
        if not info_tupe:
            break
        next_url = get_next_url(html)
        print("正在写入")
        write_to_txt(info_tupe)
        print("正在爬取%s" % info_tupe[0])
        if not next_url:
            # Last chapter: it has been written, nothing more to fetch.
            break
        time.sleep(sleep_time)  # delay the next request to reduce load on the server
        print("正在爬取%s" % next_url)
        url = next_url


if __name__ == '__main__':
    # URL of the first chapter of the novel to crawl
    # (this one is chapter 1 of "Doupo Cangqiong").
    url = "https://read.qidian.com/chapter/8iw8dkb_ZTxrZK4x-CuJuw2/fWJwrOiObhn4p8iEw--PPw2"
    print('开始运行爬虫')
    run(url)
原文链接:https://www.cnblogs.com/zhuchunyu/p/10765939.html
如有疑问请与原作者联系
标签:
版权声明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有
下一篇:Python虚拟环境包导出
- python3基础之“术语表(2)” 2019-08-13
- python3 之 字符串编码小结(Unicode、utf-8、gbk、gb2312等) 2019-08-13
- Python3安装impala 2019-08-13
- 小白如何入门 Python 爬虫? 2019-08-13
- Django项目中使用qq第三方登录。 2019-08-13
IDC资讯: 主机资讯 注册资讯 托管资讯 vps资讯 网站建设
网站运营: 建站经验 策划盈利 搜索优化 网站推广 免费资源
网络编程: Asp.Net编程 Asp编程 Php编程 Xml编程 Access Mssql Mysql 其它
服务器技术: Web服务器 Ftp服务器 Mail服务器 Dns服务器 安全防护
软件技巧: 其它软件 Word Excel Powerpoint Ghost Vista QQ空间 QQ FlashGet 迅雷
网页制作: FrontPages Dreamweaver Javascript css photoshop fireworks Flash