python3爬虫-通过requests爬取西刺代理
2019-05-04 09:42:22来源:博客园 阅读 ()
import time
from urllib.parse import urljoin

import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree

# Shared generator of random User-Agent strings.
ua = UserAgent()


class MyException(Exception):
    """Crawler signal carrying a numeric ``status`` and a readable ``msg``.

    Status 1000 is raised by ``XiCi.get_next_page_url`` when there is no
    further page to crawl.
    """

    def __init__(self, status, msg):
        self.status = status
        self.msg = msg
        # Hand the message to Exception so str(exc) and tracebacks are useful.
        super().__init__(msg)


class XiCi:
    """Crawl the xicidaili proxy listing and store working proxies in MySQL."""

    # Probe endpoint per supported proxy type (lower-cased).
    _PROBE_URLS = {
        "http": "http://icanhazip.com/",
        "https": "https://icanhazip.com/",
    }

    def __init__(self):
        # Defined first so __del__ stays safe if connecting below raises.
        self.conn = None
        self.cursor = None

        self.session = requests.Session()
        # update() keeps the session's default headers and overrides ours.
        self.session.headers.update({
            "User-Agent": ua.random,
            "Host": "www.xicidaili.com",
        })

        self.conn = pymysql.connect(host="127.0.0.1", port=3306,
                                    user="root", db="proxies")
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    def get_page_html(self, api, timeout=10):
        """Fetch ``api`` with a GET request.

        :param api: URL of the listing page.
        :param timeout: seconds to wait before giving up on the request.
        :return: the response when the status code is 200, otherwise None.
        """
        response = self.session.get(url=api, timeout=timeout)
        if response.status_code == 200:
            return response
        return None

    def __html_to_etree(self, html):
        """Parse HTML source into an lxml element tree."""
        return etree.HTML(html)

    def get_next_page_url(self, response):
        """Return the absolute URL of the next listing page.

        :raises MyException: with status 1000 when the page has no
            ``next_page`` link, i.e. the crawl is finished.
        """
        selector = self.__html_to_etree(response.text)
        hrefs = selector.xpath("//a[@class='next_page']/@href")
        if not hrefs:
            raise MyException(1000, "爬取完毕")
        return urljoin(response.url, hrefs[0])

    def __get_proxies_info(self, response):
        """Yield ``[ip, port, type]`` for each proxy row in the listing table."""
        selector = self.__html_to_etree(response.text)
        for row in selector.xpath("//*[@id='ip_list']//tr"):
            cells = [row.xpath("td[{}]/text()".format(col)) for col in (2, 3, 6)]
            # The header row has no <td>; also skip rows missing any field
            # rather than letting an IndexError abort the whole page.
            if not all(cells):
                continue
            yield [cell[0].strip() for cell in cells]

    def __detect_availability(self, data):
        """Return True when the proxy in ``data`` can reach the probe site.

        :param data: sequence of ``ip, port, type`` as yielded by the parser.
        """
        ip, port, proxy_type = data[:3]
        scheme = proxy_type.strip().lower()

        probe_url = self._PROBE_URLS.get(scheme)
        if probe_url is None:
            # For any other type the proxies mapping would not match the probe
            # URL, so the request would silently bypass the proxy.
            return False

        # NOTE(review): the proxy itself is addressed over plain HTTP; HTTPS
        # traffic is tunnelled through it. Confirm the listed proxies are
        # not TLS-to-proxy endpoints.
        proxies = {scheme: "http://{}:{}".format(ip, port)}
        try:
            probe = requests.get(probe_url, headers={"User-Agent": ua.random},
                                 proxies=proxies, timeout=3)
            probe.raise_for_status()
        except (requests.RequestException, ValueError):
            # Best-effort probe: any connection or URL problem means unusable.
            return False
        return True

    def get_usable_proxies_ip(self, response):
        """Store every proxy on the page that passes the availability probe."""
        for data in self.__get_proxies_info(response):
            if self.__detect_availability(data):
                self.save_to_db(data)

    def save_to_db(self, data):
        """Insert one ``[ip, port, type]`` record into ``proxies_table``."""
        sql = 'insert into proxies_table(ip,port,type) values(%s,%s,%s);'
        print(data)
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except pymysql.MySQLError:
            # Leave the connection in a clean state before propagating.
            self.conn.rollback()
            raise

    def run(self, api, delay=3):
        """Crawl from ``api``, following next-page links until none remain.

        :param api: URL of the first listing page.
        :param delay: seconds to sleep between pages.
        """
        page = 1
        while True:
            print("爬取第{}页数据...".format(page))
            response = self.get_page_html(api)
            if response is None:
                # Non-200 answer: there is nothing to parse, so stop here.
                print("请求失败,停止爬取: {}".format(api))
                break
            self.get_usable_proxies_ip(response)
            try:
                api = self.get_next_page_url(response)
            except MyException as e:
                if e.status != 1000:
                    # Unknown failure: re-crawling the same page forever
                    # would hide it, so let it surface.
                    raise
                print(e.msg)
                break
            page += 1
            time.sleep(delay)

    def __del__(self):
        # getattr guards against a partially constructed instance.
        for resource in (getattr(self, "cursor", None),
                         getattr(self, "conn", None),
                         getattr(self, "session", None)):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Finalizers must not raise; the resource may already be closed.
                pass


if __name__ == '__main__':
    api = "https://www.xicidaili.com/nn"
    xici = XiCi()
    xici.run(api)
原文链接:https://www.cnblogs.com/zhuchunyu/p/10808073.html
如有疑问请与原作者联系
标签:
版权声明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有
- python3基础之“术语表(2)” 2019-08-13
- python3 之 字符串编码小结(Unicode、utf-8、gbk、gb2312等) 2019-08-13
- Python3安装impala 2019-08-13
- 小白如何入门 Python 爬虫? 2019-08-13
- python day2-爬虫实现github登录 2019-08-13
IDC资讯: 主机资讯 注册资讯 托管资讯 vps资讯 网站建设
网站运营: 建站经验 策划盈利 搜索优化 网站推广 免费资源
网络编程: Asp.Net编程 Asp编程 Php编程 Xml编程 Access Mssql Mysql 其它
服务器技术: Web服务器 Ftp服务器 Mail服务器 Dns服务器 安全防护
软件技巧: 其它软件 Word Excel Powerpoint Ghost Vista QQ空间 QQ FlashGet 迅雷
网页制作: FrontPages Dreamweaver Javascript css photoshop fireworks Flash