Python爬虫之Urllib库的基本使用
2018-11-27 08:31:08来源:博客园 阅读 ()
# --- GET request ---
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))

# --- POST request: passing data= switches urlopen to POST ---
import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

# --- timeout: raise if the server does not respond in time ---
import urllib.request
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

import socket
import urllib.request
import urllib.error
try:
    # 0.1s is deliberately too short, so this demonstrates the timeout path.
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # A timeout surfaces as URLError whose .reason is a socket.timeout.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

# --- Response type ---
import urllib.request
response = urllib.request.urlopen('http://www.python.org')
print(type(response))

# --- Status code and response headers ---
import urllib.request
response = urllib.request.urlopen('http://www.python.org')
print(response.status)
print(response.getheaders())
print(response.getheader('server'))

# --- Request object: lets you attach headers/data/method ---
import urllib.request
request = urllib.request.Request('http://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'Host': 'httpbin.org'
}
# Renamed from `dict` to avoid shadowing the builtin.
form = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

# Headers can also be added one by one with add_header().
from urllib import request, parse
url = 'http://httpbin.org/post'
form = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

# --- Proxy ---
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
# BUG FIX: original had the typo 'httpbon.org'.
response = opener.open('http://httpbin.org/get')
print(response.read())

# --- Cookies ---
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + " = " + item.value)

# Save cookies to 1.txt (Mozilla/Netscape format)
import http.cookiejar, urllib.request
filename = '1.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Alternative save format: libwww-perl (LWP)
import http.cookiejar, urllib.request
filename = '1.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Load cookies back (must match the format they were saved in)
import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('1.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

# --- Error handling ---
from urllib import request, error
try:
    response = request.urlopen('http://lidonghao.com')
except error.URLError as e:
    print(e.reason)

# HTTPError is a subclass of URLError, so catch it first.
from urllib import request, error
try:
    response = request.urlopen('http://www.baidu.com/101')
except error.HTTPError as e:
    print(e.reason, e.code, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen("https://www.baidu.com", timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
# Parsing URLs with urlparse
from urllib.parse import urlparse

# A URL splits into six parts: scheme, netloc, path, params, query, fragment.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

# `scheme` is only a fallback: it applies when the URL carries no scheme...
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

# ...and is ignored when the URL already has one.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

# With allow_fragments=False the '#comment' part is not split off;
# it stays attached to the preceding component.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

# No query present: the fragment folds into the path instead.
result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
# urlunparse: rebuild a URL from its six components
from urllib.parse import urlunparse
# BUG FIX: original had 'index,html' (comma typo) for the path component.
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))

# urljoin: the second argument's components (scheme, netloc, path, ...)
# take precedence; missing ones are filled in from the base URL.
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'http://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))

# urlencode: serialize a dict into a query string
from urllib.parse import urlencode
params = {
    'name': 'germey',
    'age': 22,
}
# BUG FIX: the original concatenated without '?', yielding the broken URL
# 'http://www.baidu.comname=germey&age=22'.
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
标签:
版权申明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有
上一篇:(项目)生鲜超市(六)
下一篇:socket模块
- python3基础之“术语表(2)” 2019-08-13
- python3 之 字符串编码小结(Unicode、utf-8、gbk、gb2312等) 2019-08-13
- Python3安装impala 2019-08-13
- 小白如何入门 Python 爬虫? 2019-08-13
- python_字符串方法 2019-08-13
IDC资讯: 主机资讯 注册资讯 托管资讯 vps资讯 网站建设
网站运营: 建站经验 策划盈利 搜索优化 网站推广 免费资源
网络编程: Asp.Net编程 Asp编程 Php编程 Xml编程 Access Mssql Mysql 其它
服务器技术: Web服务器 Ftp服务器 Mail服务器 Dns服务器 安全防护
软件技巧: 其它软件 Word Excel Powerpoint Ghost Vista QQ空间 QQ FlashGet 迅雷
网页制作: FrontPages Dreamweaver Javascript css photoshop fireworks Flash