首页 > > 网络编程 > 其它 >

最让人喜欢的Python爬虫案例没有之一: 爬取妹子…

2018-06-18 03:21:27来源：未知阅读 ()

曾经有人问我，为什么要学习Python！
我说："因为我想学习爬虫！"
"那你为什么学习爬虫呢？"
"因为可以批量下载很多很多妹子图！"
其实我都是为了学习，都是为了让自己能更好地掌握Python，这只是一个练手的项目！
Emmmmm....没错，是为了学习

除了Python还能用什么语言写爬虫？

C，C++。高效率，快速，适合通用搜索引擎做全网爬取。缺点，开发慢，写起来又臭又长，例如：天网搜索源代码。
脚本语言：Perl, Python, Java（严格来说并非脚本语言）, Ruby。简单，易学，良好的文本处理能力方便网页内容的细致提取，但效率往往不高，适合对少量网站的聚焦爬取。
C#？（貌似信息管理的人比较喜欢的语言）

那为什么最终选择Python？

我只想说：人生苦短，我用Python！

那怎么爬取美腻的小姐姐照片呢？
其实爬虫不难，主要就那么几个步骤
1、打开网页，获取源码
2、获取图片
3、保存图片地址与下载图片

准备开车！

用到的模块

1 import os
2 import re
3 import requests
4 from bs4 import BeautifulSoup

模块安装

1 pip install requests
2 pip install bs4
3 pip install lxml

直接上主菜

  1 # -*- coding: utf-8 -*-
  2 import os
  3 import re
  4 import requests
  5 from bs4 import BeautifulSoup
  6 
  7 save_folder = r'./'
  8 domain_name = 'http://www.27270.com/ent/meinvtupian/'
  9 start_url = 'http://www.27270.com/ent/meinvtupian/'
 10 # 'http://699pic.com/tupian/biyeji.html'
 11 # http://www.27270.com/ent/meinvtupian/
 12 
 13 headers = {
 14     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 15     'Accept-Encoding': 'gzip, deflate, sdch',
 16     'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
 17     'Connection':'keep-alive',
 18     'DNT': '1',
 19     'Host': 'www.kongjie.com',
 20     'Referer': 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&order=hot&page=1',
 21     'Upgrade-Insecure-Requests': '1',
 22     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
 23 }
 24 uid_picid_pattern = re.compile(r'.*?uid=(\d+).*?picid=(\d+).*?')
 25 
 26 
 27 
 28 def save_img(image_url, uid, picid):
 29     """
 30     保存图片到全局变量save_folder文件夹下，图片名字为“uid_picid.ext”。
 31     其中，uid是用户id，picid是空姐网图片id，ext是图片的扩展名。
 32     Python学习交流群：125240963，群内每天分享干货，包括最新的python企业案例学习资料和零基础入门教程，欢迎各位小伙伴入群学习交流
 33     """
 34     try:
 35         response = requests.get(image_url, stream=True)
 36         # 获取文件扩展名
 37         file_name_prefix, file_name_ext = os.path.splitext(image_url)
 38         save_path = os.path.join(save_folder, uid + '_' + picid + file_name_ext)
 39         with open(save_path, 'wb') as fw:
 40             fw.write(response.content)
 41         print(uid + '_' + picid + file_name_ext, 'image saved!', image_url)
 42     except IOError as e:
 43         print('save error！', e,"111", image_url,"222")
 44 
 45 
 46 def save_images_in_album(album_url, count):
 47     """
 48     进入空姐网用户的相册，开始一张一张的保存相册中的图片。
 49     """
 50     # 解析出uid和picid，用于存储图片的名字
 51     response = requests.get(album_url)
 52     soup = BeautifulSoup(response.text, 'lxml')
 53     image_div = soup.select('.articleV4Body img')
 54 
 55     for image in image_div:
 56         print(image.attrs['src'])
 57         try:
 58             response = requests.get(image.attrs['src'])
 59             save_path = os.path.join(save_folder, str(count) + '.jpg')
 60             with open(save_path, 'wb') as fw:
 61                 fw.write(response.content)
 62         except IOError as e:
 63             print('save error！', e, "222")
 64 
 65 
 66 
 67 
 68     # next_image = soup.select_one('div.pns.mlnv.vm.mtm.cl a.btn[title="下一张"]')
 69     # if not next_image:
 70     #     return
 71     # # 解析下一张图片的picid，防止重复爬取图片，不重复则抓取
 72     # next_image_url = next_image['href']
 73     # next_uid_picid_match = uid_picid_pattern.search(next_image_url)
 74     # if not next_uid_picid_match:
 75     #     return
 76     # next_uid = next_uid_picid_match.group(1)
 77     # next_picid = next_uid_picid_match.group(2)
 78     # # if not redis_con.hexists('kongjie', next_uid + ':' + next_picid):
 79     # save_images_in_album(next_image_url)
 80 
 81 
 82 def parse_album_url(url):
 83     """
 84     解析出相册url，然后进入相册爬取图片
 85     """
 86     response = requests.get(url)
 87     soup = BeautifulSoup(response.text, 'lxml')
 88     people_list = soup.select('li a.tit')
 89     count = 0
 90     for people in people_list:
 91         save_images_in_album(people.attrs['href'], count)
 92         count = count + 1
 93         # break
 94 
 95     # # 爬取下一页
 96     # next_page = soup.select_one('a.nxt')
 97     # if next_page:
 98     #     parse_album_url(next_page['href'])
 99 
# Script entry point: crawl the configured start page when run directly.
if __name__ == "__main__":
    parse_album_url(start_url)