首页 > > 网络编程 > 其它 >

使用python-aiohttp爬取今日头条

2018-06-18 00:53:41 来源：未知 阅读 ()

http://blog.csdn.net/u011475134/article/details/70198533 原出处

在上一篇文章《使用python-aiohttp爬取网易云音乐》中，我们给自己的微信公众号添加了在线点歌的功能，这次我们再增加一个新闻浏览的功能。由于我平时浏览新闻用的是今日头条，所以在这里就想通过爬取今日头条来获取新闻。不过遗憾的是，这一次我在网上没有找到满意的方法，所以还是自己动手吧。

打开抓包软件Fiddler并设置Filters。
这里写图片描述

打开今日头条网页，选择热点。
url：http://www.toutiao.com/ch/news_hot/
这里写图片描述

在Fiddler中找到与上面新闻对应的json数据与url，由于json数据太多，这里只给出部分。

{
    "has_more": false,
    "message": "success",
    "data": [
        {
            "chinese_tag": "体育",
            "media_avatar_url": "http://p3.pstatp.com/large/3538/9145332",
            "is_feed_ad": false,
            "tag_url": "news_sports",
            "title": "中国足协想搬出北京五环，原来是相中了这块风水宝地",
            "single_mode": true,
            "middle_mode": true,
            "abstract": "中国足协搬家的传说，很可能将水落石出。而且，此前的传说其实很靠谱，中国足协确实有意要从目前位于北京东城区夕照寺街的东玖大厦，搬到京城五环路以外。南海子虽然在北京五环外，但是紧邻南五环，距离位于体育馆路的国家体育总局也不过19公里。",
            "tag": "news_sports",
            "behot_time": 1492391171,
            "source_url": "/group/6409606379224957186/",
            "source": "长安街知事",
            "more_mode": false,
            "article_genre": "article",
            "image_url": "http://p3.pstatp.com/list/190x124/1bf4000b11da52a33c32",
            "has_gallery": false,
            "group_source": 2,
            "comments_count": 28,
            "group_id": "6409606379224957186",
            "media_url": "/c/user/4327876576/"
        },{},{},{},{},{},{}
    ],
    "next": {
        "max_behot_time": 1492391156
    }
}

可以看到，在json数据中的data是一个包含新闻的列表，其中的title是新闻的标题，abstract是新闻的摘要，source_url是新闻的链接，image_url是新闻图片的链接，这些都是我们需要的。

http://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A135888F14A1507&cp=58F4A1A500177E1

参数	取值	说明
category	news_hot	类型，定值
utm_source	toutiao	定值
widen	1	定值
max_behot_time	0	偏移量（默认为0）
max_behot_time_tmp	0	与max_behot_time相等
tadrequire	true	定值
as	A135888F14A1507	未知
cp	58F4A1A500177E1	未知

通过多次观察发现，max_behot_time类似偏移量，点击热点时，取值为零，下拉网页时，取值为上一个json数据中的next[max_behot_time]，由于点击热点就可以刷新新闻，所以让max_behot_time等于固定值0就好。

as和cp每次都会改变，但没有找到规律，推测应该是每次请求时，按照一定规律生成的数据，于是查看网页源码，很明显，下面这段代码就是用来产生as和cp的，从代码中可以看到，as和cp相当于一个时间戳，我们可以仿照这段代码用python来生成as和cp。

e.getHoney = function() {
    // Current Unix time in whole seconds and its upper-case hexadecimal form.
    var seconds = Math.floor(new Date().getTime() / 1000);
    var hexTime = seconds.toString(16).toUpperCase();
    // Upper-case MD5 digest of the timestamp (md5 is supplied by the page).
    var digest = md5(seconds).toString().toUpperCase();

    // Fixed fallback pair when the hex timestamp is not exactly 8 digits long.
    if (hexTime.length !== 8) {
        return {
            as: "479BB4B7254C150",
            cp: "7E0AC8874BB0985"
        };
    }

    var head = digest.slice(0, 5);
    var tail = digest.slice(-5);
    var asBody = "";
    var cpBody = "";
    for (var k = 0; k < 5; k++) {
        // "as": interleave the first 5 digest chars with hex digits 0..4.
        asBody += head[k] + hexTime[k];
        // "cp": interleave hex digits 3..7 with the last 5 digest chars.
        cpBody += hexTime[k + 3] + tail[k];
    }

    return {
        as: "A1" + asBody + hexTime.slice(-3),
        cp: hexTime.slice(0, 3) + cpBody + "E1"
    };
};

新建文件toutiao3.py，代码如下：

  1 import asyncio
  2 from aiohttp import ClientSession
  3 import time
  4 import math
  5 import hashlib
  6 
  7 __NEWS_NUM = 1  # Maximum number of news items to return
  8 
  9 def getASCP():
 10     t = int(math.floor(time.time()))
 11     e = hex(t).upper()[2:]
 12     m = hashlib.md5()
 13     m.update(str(t).encode(encoding='utf-8'))
 14     i = m.hexdigest().upper()
 15 
 16     if len(e) != 8:
 17         AS = '479BB4B7254C150'
 18         CP = '7E0AC8874BB0985'
 19         return AS,CP
 20 
 21     n = i[0:5]
 22     a = i[-5:]
 23     s = ''
 24     r = ''
 25     for o in range(5):
 26         s += n[o] + e[o]
 27         r += e[o + 3] + a[o]
 28 
 29     AS = 'A1' + s + e[-3:]
 30     CP = e[0:3] + r + 'E1'
 31     return AS,CP
 32 
 33 async def __fetch(url,data,loop):
 34     try:
 35         async with ClientSession(loop=loop) as session:
 36             # hu 发送GET请求，params为GET请求参数，字典类型
 37             async with session.get(url, params=data,timeout=5) as response:
 38                 # hu 以json格式读取响应的body并返回字典类型
 39                 return await response.json()
 40     except Exception as ex:
 41         print('__fetch:%s' % ex)
 42 
 43 async def getNewsInfo(loop):
 44     global __NEWS_NUM
 45     AS,CP = getASCP()
 46     urlTouTiao = 'http://www.toutiao.com'
 47     urlNews = 'http://www.toutiao.com/api/pc/feed/'
 48     dataNew = {'category': 'news_hot',
 49                'utm_source': 'toutiao',
 50                'widen': '1',
 51                'max_behot_time': '0',
 52                'max_behot_time_tmp':'0',
 53                'tadrequire':'true',
 54                'as':AS,
 55                'cp':CP}
 56     result = None
 57     try:
 58         task = asyncio.ensure_future(__fetch(urlNews, dataNew,loop),loop=loop)
 59         taskDone = await asyncio.wait_for(task,timeout=5)
 60         if 'message' not in taskDone or taskDone['message'] != 'success':
 61             return result
 62 
 63         result = {'max_behot_time':taskDone['next']['max_behot_time'],
 64                   'news':[]}
 65 
 66         for news_hot in taskDone['data']:
 67             news = {'Title': None,
 68                     'Description': None,
 69                     'PicUrl': None,
 70                     'Url': None}
 71             # hu 去掉广告
 72             if news_hot['is_feed_ad']:
 73                 continue
 74             news['Title'] = news_hot['title']
 75             if 'abstract' in news_hot:
 76                 news['Description'] = news_hot['abstract']
 77             else:
 78                 news['Description'] = ''
 79             if 'image_url' in news_hot:
 80                 news['PicUrl'] = news_hot['image_url']
 81             else:
 82                 news['PicUrl'] = ''
 83             news['Url'] = urlTouTiao + news_hot['source_url']
 84             result['news'].append(news)
 85             if len(result['news']) == __NEWS_NUM:
 86                 break
 87         # hu 把有图片的新闻放到前面
 88         result['news'].sort(key=lambda obj: obj.get('PicUrl'), reverse=True)
 89     except Exception as ex:
 90         print('getNewsInfo:%s' % ex)
 91     return result
 92 
 93 def __main():
 94     loop = asyncio.get_event_loop()
 95     for ii in range(2):
 96         task = asyncio.ensure_future(getNewsInfo(loop),loop=loop)
 97         taskDone = loop.run_until_complete(task)
 98         print('第%d次：' % ii)
 99         for res in taskDone['news']:
100             print(res)
101     loop.close()
102 
103 if __name__ == '__main__':
104     __main()

返回值：

第0次：
{'Title': '长沙现微型古籍 疑是明清科举作弊用书', 'Description': '', 'PicUrl': 'http://p3.pstatp.com/list/190x124/1bbf000b36c2b4f07ca0', 'Url': 'http://www.toutiao.com/group/6409737414201721090/'}
第1次：
{'Title': '长沙现微型古籍 疑是明清科举作弊用书', 'Description': '', 'PicUrl': 'http://p3.pstatp.com/list/190x124/1bbf000b36c2b4f07ca0', 'Url': 'http://www.toutiao.com/group/6409737414201721090/'}

结果每次获取的新闻都是一样的，通过Fiddler多次观察，发现新闻的刷新和Cookie中的tt_webid有关，当该参数不存在时，则返回默认新闻与tt_webid，所以我们只要取出响应中的tt_webid，并在发送请求时给tt_webid赋值就好。

将toutiao3.py文件做如下修改：

 1 #### hu 增加代码 ######
 2 __cookie = None
 3 ######################
 4 
 5 async def __fetch(url,data,loop):
 6     global __cookie
 7     try:
 8         async with ClientSession(cookies=__cookie,loop=loop) as session:
 9             async with session.get(url, params=data,timeout=5) as response:
10 ########################### hu 增加代码 ##############################
11                 if response.cookies and 'tt_webid' in response.cookies:
12                     __cookie = {'tt_webid':response.cookies['tt_webid'].value}
13 #####################################################################
14                 return await response.json()
15     except Exception as ex:
16         print('__fetch:%s' % ex)

返回值：

第0次：
{'Title': '长沙现微型古籍 疑是明清科举作弊用书', 'Description': '', 'PicUrl': 'http://p3.pstatp.com/list/190x124/1bbf000b36c2b4f07ca0', 'Url': 'http://www.toutiao.com/group/6409737414201721090/'}
第1次：
{'Title': '陈羽凡设立投资13家公司 3家被吊销营业执照', 'Description': '[海峡都市报-海峡网]（原标题：陈羽凡商业版图冰火两极）4月16日，歌手陈羽凡率先发声，通过视频正式澄清已与白百何协议离婚，并表示将无限期退出娱乐圈。一时之间，业内外开始对陈羽凡退出娱乐圈以后将何去何从进行诸多揣测。', 'PicUrl': 'http://p1.pstatp.com/list/190x124/1bbd000c3eef588ea140', 'Url': 'http://www.toutiao.com/group/6409610618823590146/'}