My First Python Crawler
2018-06-18 03:18:34
My first crawler, haha. It is purely procedural code.
Goals:
1. Read the target URL from a local conf file, then scrape the video titles and their corresponding download URLs.
2. Write the scraped download URLs to a separate local file so they can be copied and pasted into a download tool.
Enough preamble. The code is as follows:
#!/usr/local/python/bin/python3

import requests
import re
import chardet
import random
import signal
import time
import os
import sys


def DealwithURL(url):
    # Follow a <meta refresh> redirect if the page has one.
    # Returns the redirect target, or True when there is no redirect.
    r = requests.get(url)
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    findurl = re.findall(pattern, r.text)

    if findurl:
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        transferurl = re.findall(pattern, r.text)[0]
        return transferurl
    else:
        return True


def GetNewURL(url):
    # Extract the site's current address from the JavaScript alert() on the jump page.
    r = requests.get(url)
    r.encoding = 'utf-8'
    pattern = re.compile('alert(.*)">')
    findurl = re.findall(pattern, r.text)
    findurl_str = " ".join(findurl)
    return findurl_str.split(' ', 1)[0][2:]


def gettrueurl(url):
    # Resolve the URL from the conf file to the site's current address.
    if DealwithURL(url) == True:
        return url
    else:
        return GetNewURL(DealwithURL(url))


def SaveLocalUrl(untreatedurl, treatedurl):
    # If the site has moved, rewrite main.conf with the new address.
    if untreatedurl == treatedurl:
        pass
    else:
        try:
            fileconf = open(r'main.conf', 'r')
            rewritestr = ""

            for readline in fileconf:
                if re.search(untreatedurl, readline):
                    readline = re.sub(untreatedurl, treatedurl, readline)
                rewritestr = rewritestr + readline
            fileconf.close()

            fileconf = open(r'main.conf', 'w')
            fileconf.write(rewritestr)
            fileconf.close()
        except OSError:
            print("got a new url but failed to rewrite main.conf, write to logs")


def handler(signum, frame):
    # SIGALRM handler: turn the alarm into an exception we can catch.
    raise AssertionError


def WriteLocalDownloadURL(downfile, downurled2k):
    # Append one download link per line to the output file.
    with open(downfile, 'a+') as urlfile:
        urlfile.write(downurled2k + '\n')


def GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers):
    # Fetch one detail page and pull the ed2k link out of its <textarea>.
    downurlstr = " ".join(titleurl)
    downnamestr = " ".join(titlename)

    r = requests.get(sourceurl + downurlstr, headers=headers)
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')

    downurled2k = re.findall(pattern, r.text)
    downurled2kstr = " ".join(downurled2k)

    WriteLocalDownloadURL(update_file, downurled2kstr)
    print(downnamestr, downurled2kstr)


def ReadLocalFiles():
    # Parse main.conf into a dict of key=value pairs; '#' starts a comment.
    # Note: the first blank line ends parsing.
    returndict = {}
    localfiles = open(r'main.conf')

    readline = localfiles.readline().rstrip()

    while readline:
        if readline.startswith('#'):
            pass
        else:
            try:
                returndict[readline.split('=')[0]] = readline.split('=')[1]
            except IndexError:
                print("Please check your conf: %s" % readline)
                sys.exit(1)
        readline = localfiles.readline().rstrip()

    localfiles.close()
    return returndict


def GetListURLinfo(sourceurl, title, getpagenumber, total, update_file, headers):
    # Visit `total` randomly chosen list pages and harvest every entry on them.
    # `total` is clamped to [2, 100]; it then replaces getpagenumber, so the
    # random page numbers are only drawn from the first `total` pages.
    if total >= 100:
        total = 100
    if total <= 1:
        total = 2

    getpagenumber = total

    for number in range(0, total):
        try:
            # Abort any page that takes longer than 3 seconds (Unix only).
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(3)

            url = sourceurl + title + '-' + str(random.randint(1, getpagenumber)) + '.html'

            r = requests.get(url, headers=headers)

            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            r.encoding = chardet.detect(r.content)['encoding']
            allurl = re.findall(pattern, r.text)

            for lineurl in allurl:
                try:
                    signal.signal(signal.SIGALRM, handler)
                    signal.alarm(3)

                    pattern = re.compile('<a href="(.*)" title')
                    titleurl = re.findall(pattern, lineurl)

                    pattern = re.compile('title="(.*)" target=')
                    titlename = re.findall(pattern, lineurl)

                    GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers)
                    signal.alarm(0)
                except AssertionError:
                    print(lineurl, titlename, "Timeout Error: the request did not finish within 3s")
                    continue

        except AssertionError:
            print("GetListURLinfo error")
            continue


def GetTitleInfo(url, down_page, update_file, headers):
    # Pick a random category, read its pager to learn the total page count,
    # then walk the list pages.
    title = '/list/' + str(random.randint(1, 8))
    titleurl = url + title + '.html'

    r = requests.get(titleurl, headers=headers)
    r.encoding = chardet.detect(r.content)['encoding']
    pattern = re.compile(' 当前:.*/(.*)页 ')
    getpagenumber = re.findall(pattern, r.text)
    getpagenumber = " ".join(getpagenumber)

    GetListURLinfo(url, title, int(getpagenumber), int(down_page), update_file, headers)


def write_logs(timestamp, logs):
    # Append a timestamped line to the local log file.
    loginfo = str(timestamp) + logs
    try:
        logfile = open(r'logs', 'a+')
        logfile.write(loginfo)
        logfile.close()
    except OSError:
        print("Write logs error, code:154")


def DeleteHisFiles(update_file):
    # Truncate last run's output file so each run starts clean.
    if os.path.isfile(update_file):
        try:
            download_files = open(update_file, 'r+')
            download_files.truncate()
            download_files.close()
        except OSError:
            print("Delete " + update_file + " error --code:166")
    else:
        print("Will build a new download file")


def main():
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }

    readconf = ReadLocalFiles()

    try:
        file_url = readconf['url']
        down_page = readconf['download_page']
        update_file = readconf['download_local_files']
    except KeyError:
        print("Failed to read the local conf, please check it")
        sys.exit(-1)

    DeleteHisFiles(update_file)

    untreatedurl = file_url

    treatedurl = gettrueurl(untreatedurl)
    SaveLocalUrl(untreatedurl, treatedurl)

    url = treatedurl

    GetTitleInfo(url, int(down_page), update_file, headers)


if __name__ == "__main__":
    main()
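A note on the timeout handling: the signal.SIGALRM watchdog above works only on Unix and only in the main thread. If you do not specifically want that pattern, requests has a built-in timeout argument that bounds the connect and read phases of each request. A minimal portable sketch, assuming you just want per-request timeouts (the fetch() helper and the example URL are illustrative, not from the original post):

import requests

def fetch(url, headers, seconds=3):
    # Give up if connecting or reading stalls for more than `seconds`.
    try:
        r = requests.get(url, headers=headers, timeout=seconds)
        r.raise_for_status()
        return r.text
    except requests.RequestException as exc:
        print("fetch failed:", url, exc)
        return None

# Hypothetical usage:
# html = fetch("http://example.com/list/1.html", {"User-Agent": "Mozilla/5.0"})

With this approach the nested signal.signal()/signal.alarm() calls in GetListURLinfo would not be needed at all.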
The corresponding main.conf looks like this:
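(The conf file itself did not survive in this copy of the post. Judging from ReadLocalFiles() and main(), it is a plain key=value file in which lines starting with # are comments and the three keys below are required. The values here are placeholders of mine, not the author's:)

# main.conf -- illustrative values only
url=http://example.com
download_page=5
download_local_files=downurl.txt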
I wrote this code out of curiosity about crawlers. If you are interested in it, message me privately for the full conf details. After all, I work in ops and have a reputation to keep: I never use my servers to download movies.