首页 > > 网络编程 > 其它 >

Python抓取京东商品详情页数据到Excel

2018-06-18 03:17:16　来源：未知　阅读：()

Python路上，你我同行！

模块

1 import  requests
2 from bs4 import  BeautifulSoup
3 import openpyxl
4 import  time
5 import  re
6 import json

获取搜索结果列表页中每个商品的链接

 1 def make_a_link(keyword,page):
 2     url = 'https://search.jd.com/Search?keyword=' + keyword + '&enc=utf-8&page=' + str(page * 2 - 1)
 3     res = requests.get(url)
 4     res.raise_for_status()
 5     res.encoding = res.apparent_encoding  # 转码 为防止出现乱码
 6     print('正在爬取第' + str(page) + '页:' + url)
 7     html = res.text
 8     soup = BeautifulSoup(html, 'lxml')
 9     links = soup.find_all('li', class_='gl-item')  # 所有商品的li
10     return (link for link in links)

详情页

def detail_link(purl):
    """Download *purl* and return the response body decoded as GBK.

    Args:
        purl: URL to request.

    Returns:
        The response text, or ``''`` when the request fails, times out, or
        the server answers with an HTTP error status.
    """
    try:
        # A timeout prevents an unresponsive server from hanging the run.
        r = requests.get(purl, timeout=10)
        # Must be *called*; a bare attribute reference checks nothing.
        r.raise_for_status()
    except requests.RequestException:
        print('此页无法链接！！！')
        return ''
    r.encoding = 'gbk'
    return r.text

商品的名字和价格

def get_name_price(uid, row, sheet):
    """Write a product's name and price into columns 2 and 3 of *row*.

    The data is read from the ``accessories.data`` object of the JSON
    returned by the c.3.cn recommend endpoint (``wName`` and ``wMaprice``).

    Args:
        uid: Product SKU as a string.
        row: 1-based worksheet row to fill.
        sheet: openpyxl worksheet that receives the values.

    Returns:
        ``None`` on success, ``""`` when the response cannot be parsed or
        does not have the expected structure (best effort: the row is left
        as it is).
    """
    content = detail_link('https://c.3.cn/recommend?&methods=accessories&sku=' + uid + '&cat=9987%2C653%2C655')
    try:
        data = json.loads(content)['accessories']['data']
        # Use the public ``value`` setter rather than the private
        # ``_value`` attribute so openpyxl can record the cell's data type.
        sheet.cell(row=row, column=2).value = data['wName']
        sheet.cell(row=row, column=3).value = data['wMaprice']
        print(data['wName'])
    except (ValueError, LookupError, TypeError):
        # ValueError covers invalid JSON (including the '' returned by a
        # failed download); LookupError/TypeError cover an unexpected shape.
        return ""

店铺

 1 def get_shop(uid,row,sheet):
 2     content = detail_link('https://chat1.jd.com/api/checkChat?pid=' + uid + '&returnCharset=utf-8')
 3     try:
 4         jd = json.loads(content.lstrip('null(').rstrip(');'))
 5         try:
 6             sheet.cell(row=row, column=4)._value = jd['seller']
 7         except:
 8             return ''
 9     except:
10         ''

商品的评论

 1 def get_comments(uid,row,sheet):
 2     content = detail_link('https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + uid)
 3     jd=json.loads(content)
 4     sheet.cell(row=row,column=6)._value=jd['CommentsCount'][0]['CommentCountStr'] #总评
 5     sheet.cell(row=row,column=7)._value=jd['CommentsCount'][0]['GoodCountStr'] #好评
 6     sheet.cell(row=row,column=8)._value=jd['CommentsCount'][0]['GoodRate'] #好评率
 7 def main():
 8     wb =openpyxl.Workbook()
 9     sheet=wb.active
10     sheet.title = "京东抓取商品数据.xlsx"
11     sheet.cell(row=1,column=1)._value='商品ID'
12     sheet.cell(row=1,column=2)._value='商品名称'
13     sheet.cell(row=1,column=3)._value= '价格'
14     sheet.cell(row=1,column=4)._value='店铺'
15     sheet.cell(row=1,column=5)._value= '链接'
16     sheet.cell(row=1,column=6)._value='评论数'
17     sheet.cell(row=1,column=7)._value='好评数'
18     sheet.cell(row=1,column=8)._value= '评论率'
19     row=2
20     keyword=input("请输入要抓取的商品：")
21     pages=input("要抓取的页数：")   #str类型
22     starttime=time.time()
23     pages=int(pages)
24     for page in range(1,pages+1):
25         for link in make_a_link(keyword, page):
26             uid=link['data-sku']
27             # uid = re.match(r'.+?(\d+).+', purl).group(1)  # 商品的ID
28             sheet.cell(row=row, column=1)._value =uid
29             purl=link.find('div', class_='p-name p-name-type-2').a['href']
30             if 'http' not in purl:
31                 sheet.cell(row=row, column=5)._value = 'http:' + purl
32             else:
33                 sheet.cell(row=row, column=5)._value = purl
34             get_name_price(uid, row, sheet)
35             get_shop(uid, row, sheet)
36             get_comments(uid, row, sheet)
37             row+=1
38     wb.save('京东抓取'+keyword+'数据.xlsx')
39     print('耗时{}秒。'.format(time.time() - starttime))  # 爬取所需时间
40 if __name__ == '__main__':
41     main()