Scraping Job Listings from Tencent Recruitment
2019-04-11 10:16:34 | Source: 博客园 (cnblogs)
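This post collects two small crawlers for Tencent's recruitment site (hr.tencent.com). The first script below searches for the keyword "python", walks the paginated result list with requests and BeautifulSoup, follows each position's detail page for its duties and requirements, and appends everything to a text file.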
import requests
from bs4 import BeautifulSoup
from math import ceil

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}


# Get the number of result pages
def getJobPage(url):
    ret = requests.get(url, headers=header)
    ret.encoding = "utf-8"  # avoid mojibake
    html = ret.text
    soup = BeautifulSoup(html, 'html.parser')
    # Total number of positions, e.g. <span class="lightblue total">512</span>
    totalJob = soup.select('span[class="lightblue total"]')[0].text
    jobPage = ceil(int(totalJob) / 10)  # 10 positions per listing page
    return jobPage


# Get the duties and requirements from a position's detail page
def getJobOrder(url):
    ret = requests.get(url, headers=header)
    ret.encoding = "utf-8"  # avoid mojibake
    html = ret.text
    soup = BeautifulSoup(html, 'html.parser')
    # Job duties
    jobRequests = soup.select('ul[class="squareli"]')[0].text
    # Job requirements
    jobOrder = soup.select('ul[class="squareli"]')[1].text
    return jobRequests, jobOrder


# Get the positions on one listing page
def getJobInfo(url):
    ret = requests.get(url, headers=header)
    ret.encoding = "utf-8"  # avoid mojibake
    html = ret.text
    soup = BeautifulSoup(html, 'html.parser')
    jobList = soup.find_all('tr', class_=['even', 'odd'])
    # gb18030 with errors='ignore' avoids encoding errors when writing
    with open("tencent_job.txt", "a", encoding='gb18030', errors='ignore') as myfile:
        for job in jobList:
            # Detail-page URL
            jobUrl = "https://hr.tencent.com/" + job.select('td:nth-of-type(1) > a')[0]['href']
            # Position name
            jobName = job.select('td:nth-of-type(1) > a')[0].text
            # Headcount
            jobPeople = job.select('td:nth-of-type(3)')[0].text
            # Location
            jobAddre = job.select('td:nth-of-type(4)')[0].text
            # Publication date
            jobTime = job.select('td:nth-of-type(5)')[0].text
            # Duties and requirements from the detail page (one request per position)
            jobRequests, jobOrder = getJobOrder(jobUrl)
            # print(jobName, jobUrl, jobAddre, jobPeople, jobTime, jobRequests, jobOrder)
            tt = jobName + " " + jobUrl + " " + jobAddre + " " + jobPeople + " " + jobTime + " " + jobRequests + " " + jobOrder
            myfile.write(tt + "\n")


if __name__ == '__main__':
    mainurl = 'https://hr.tencent.com/position.php?keywords=python'
    jobPage = getJobPage(mainurl)
    print(jobPage)
    for page in range(jobPage):
        pageUrl = 'https://hr.tencent.com/position.php?keywords=python&start=' + str(page * 10) + '#a'
        print("Page " + str(page + 1))
        getJobInfo(pageUrl)
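The CSS selectors above assume the old table-based layout of hr.tencent.com, where each result row is a tr with class even or odd and the cells hold the name link, category, headcount, location and date in that order. A minimal sketch of the same extraction run against an inline fragment (the fragment is a hypothetical stand-in for that layout, not a capture of the live page, which may have changed since the post was written):

from bs4 import BeautifulSoup

# Hypothetical fragment modeled on the table layout the crawler expects.
sample = """
<table>
  <tr class="even">
    <td><a href="position_detail.php?id=1">Python Backend Engineer</a></td>
    <td>Tech</td><td>2</td><td>Shenzhen</td><td>2019-04-11</td>
  </tr>
</table>
"""

soup = BeautifulSoup(sample, "html.parser")
for row in soup.find_all("tr", class_=["even", "odd"]):
    name = row.select("td:nth-of-type(1) > a")[0].text
    link = "https://hr.tencent.com/" + row.select("td:nth-of-type(1) > a")[0]["href"]
    count = row.select("td:nth-of-type(3)")[0].text
    place = row.select("td:nth-of-type(4)")[0].text
    date = row.select("td:nth-of-type(5)")[0].text
    print(name, link, count, place, date)

The second script below organizes the same listing crawl as a class, pages through the results with the start parameter, and saves the extracted fields to a JSON file instead of a text file.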
# -*- coding:utf-8 -*-
import requests, json, time
from bs4 import BeautifulSoup


class tencent_hr(object):
    def __init__(self):
        self.base_url = "http://hr.tencent.com/position.php?"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        self.item_list = []
        self.page = 0

    # Send a request
    def send_request(self, url, params={}):
        time.sleep(2)
        try:
            response = requests.get(url, params=params, headers=self.headers)
            return response.content
        except Exception as e:
            print(e)

    # Parse one listing page
    def parse_data(self, data):
        bs = BeautifulSoup(data, 'lxml')
        # Each result row has class "even" or "odd"
        data_list = bs.select('.even, .odd')
        # Extract the fields of every row
        for data in data_list:
            data_dict = {}
            data_dict['work_name'] = data.select('td a')[0].get_text()
            data_dict['work_type'] = data.select('td')[1].get_text()
            data_dict['work_count'] = data.select('td')[2].get_text()
            data_dict['work_place'] = data.select('td')[3].get_text()
            data_dict['work_time'] = data.select('td')[4].get_text()
            # Collect each row's dict
            self.item_list.append(data_dict)
        # Detect the last page: the "next" link then carries the class "noactive"
        next_label = bs.select('#next')
        # get('class') returns a list of class values, or None if the attribute is missing
        judge = next_label[0].get('class')
        return judge

    # Write the results to a JSON file
    def write_file(self):
        # Serialize the list of dicts as one JSON string
        data_str = json.dumps(self.item_list)
        with open('04tencent_hr.json', 'w') as f:
            f.write(data_str)

    # Drive the crawl
    def run(self):
        while True:
            # Query-string parameters
            params = {
                "keywords": "python",
                "tid": "0",
                "lid": "2156",
                "start": self.page,
            }
            # Fetch the page
            data = self.send_request(self.base_url, params=params)
            # Parse it
            judge = self.parse_data(data)
            self.page += 10
            print(self.page)
            # On the last page the "next" link has "noactive", so stop
            if judge:
                break
        self.write_file()


if __name__ == '__main__':
    spider = tencent_hr()
    spider.run()
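Since write_file dumps the collected dictionaries as a single JSON array, the results can be checked afterwards with nothing but the standard library. A minimal sketch, assuming 04tencent_hr.json already exists from a run of the crawler above:

import json

# Read back the array of job dicts written by tencent_hr.write_file().
with open("04tencent_hr.json", "r", encoding="utf-8") as f:
    jobs = json.load(f)

print(len(jobs), "positions collected")
for job in jobs[:5]:
    print(job["work_name"], job["work_place"], job["work_time"])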
Original article: https://www.cnblogs.com/bkwxx/p/10619180.html
If you have any questions, please contact the original author.