My first Python crawler


My first crawler, haha, written in a purely procedural style.

Goals:

  1. Read the URL stored in a local conf file, then scrape the video titles and their corresponding download URLs from that site.

  2. Write the scraped download URLs to a separate local file so they can be copied and pasted into a download tool.

Without further ado, the result of running the code looks like this:
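Roughly, each line the script prints is a video title followed by its ed2k link, and the link alone is also appended to the local download file. The title and link below are invented purely for illustration:

    Example.Video.Name.2018 ed2k://|file|Example.Video.Name.2018.mkv|1468006400|0123456789ABCDEF0123456789ABCDEF|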

 

The code is as follows:

#!/usr/local/python/bin/python3

import requests
import re
import chardet
import random
import signal
import time
import os
import sys

def DealwithURL(url):
    # If the page is a <meta refresh> redirect, return the redirect target;
    # otherwise return True to signal that the original URL is still valid.
    r = requests.get(url)
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    findurl = re.findall(pattern, r.text)

    if findurl:
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        transferurl = re.findall(pattern, r.text)[0]
        return transferurl
    else:
        return True

def GetNewURL(url):
    # Pull the announced new site address out of the page's alert() text.
    r = requests.get(url)
    r.encoding = 'utf-8'
    pattern = re.compile('alert(.*)">')
    findurl = re.findall(pattern, r.text)
    findurl_str = (" ".join(findurl))
    return (findurl_str.split(' ', 1)[0][2:])

def gettrueurl(url):
    # Use the URL as-is when it is not redirected; otherwise resolve the new one.
    if DealwithURL(url) == True:
        return url
    else:
        return GetNewURL(DealwithURL(url))

def SaveLocalUrl(untreatedurl, treatedurl):
    # When the resolved URL differs from the one in main.conf, rewrite the conf
    # so the next run starts from the new address.
    if untreatedurl == treatedurl:
        pass
    else:
        try:
            fileconf = open(r'main.conf', 'r')
            rewritestr = ""

            for readline in fileconf:
                if re.search(untreatedurl, readline):
                    readline = re.sub(untreatedurl, treatedurl, readline)
                    rewritestr = rewritestr + readline
                else:
                    rewritestr = rewritestr + readline
            fileconf.close()

            fileconf = open(r'main.conf', 'w')
            fileconf.write(rewritestr)
            fileconf.close()

        except:
            print("get new url but open files ng, write to logs")

def handler(signum, frame):
    # SIGALRM handler: turn the alarm into an exception so slow requests can be skipped.
    raise AssertionError

def WriteLocalDownloadURL(downfile, downurled2k):
    # Append one download link per line to the local result file.
    urlfile = open(downfile, 'a+')
    urlfile.write(downurled2k + '\n')
    urlfile.close()

def GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers):
    # Open the detail page of one video and extract its ed2k link.
    downurlstr = (" ".join(titleurl))
    downnamestr = (" ".join(titlename))

    r = requests.get((sourceurl + downurlstr), headers=headers)
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')

    downurled2k = re.findall(pattern, r.text)
    downurled2kstr = (" ".join(downurled2k))

    WriteLocalDownloadURL(update_file, downurled2kstr)

    print(downnamestr, downurled2kstr)


def ReadLocalFiles():
    # Parse main.conf into a dict of key=value pairs, ignoring '#' comment lines.
    returndict = {}
    localfiles = open(r'main.conf')

    readline = localfiles.readline().rstrip()

    while readline:
        if readline.startswith('#'):
            pass
        else:
            try:
                readline = readline.rstrip()
                returndict[readline.split('=')[0]] = readline.split('=')[1]
            except:
                print("Please check your conf: %s" % (readline))
                sys.exit(1)
        readline = localfiles.readline().rstrip()

    localfiles.close()
    return returndict


def GetListURLinfo(sourceurl, title, getpagenumber, total, update_file, headers):
    # Walk random list pages of one category and hand every video entry found
    # there to GetDownloadURL, with a 3-second alarm around each request.

    if total >= 100:
        total = 100

    if total <= 1:
        total = 2

    getpagenumber = total

    for number in range(0, total):
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(3)

            url = sourceurl + title + '-' + str(random.randint(1, getpagenumber)) + '.html'

            r = requests.get(url, headers=headers)

            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            r.encoding = chardet.detect(r.content)['encoding']
            allurl = re.findall(pattern, r.text)

            for lineurl in allurl:
                try:
                    signal.signal(signal.SIGALRM, handler)
                    signal.alarm(3)

                    pattern = re.compile('<a href="(.*)" title')
                    titleurl = re.findall(pattern, lineurl)

                    pattern = re.compile('title="(.*)" target=')
                    titlename = re.findall(pattern, lineurl)

                    GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers)
                    signal.alarm(0)
                except AssertionError:
                    print(lineurl, titlename, "Timeout Error: the request did not finish within 3s")
                    continue

#            title = '/list/'+str(random.randint(1,8))
#            print (title)
#            print (title_header)

        except AssertionError:
            print("GetListURLinfo Error")
            continue

def GetTitleInfo(url, down_page, update_file, headers):
    # Pick a random category, read its total page count from the pager text,
    # then let GetListURLinfo crawl it.

    title = '/list/' + str(random.randint(1, 8))

    titleurl = url + title + '.html'

    r = requests.get(titleurl, headers=headers)
    r.encoding = chardet.detect(r.content)['encoding']
    pattern = re.compile('&nbsp;当前:.*/(.*)页&nbsp;')
    getpagenumber = re.findall(pattern, r.text)

    getpagenumber = (" ".join(getpagenumber))

    GetListURLinfo(url, title, int(getpagenumber), int(down_page), update_file, headers)


def write_logs(time, logs):
    loginfo = str(time) + logs
    try:
        logfile = open(r'logs', 'a+')
        logfile.write(loginfo)
        logfile.close()
    except:
        print("Write logs error,code:154")


def DeleteHisFiles(update_file):
    # Truncate the result file from the previous run so each run starts clean.
    if os.path.isfile(update_file):
        try:
            download_files = open(update_file, 'r+')
            download_files.truncate()
            download_files.close()
        except:
            print("Delete " + update_file + " Error --code:166")
    else:
        print("Build New downfiles")

def main():
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome", "Accept": "text/html,application/xhtml+xml,application/xml; q=0.9,image/webp,*/*;q=0.8"}

    readconf = ReadLocalFiles()

    try:
        file_url = readconf['url']
        down_page = readconf['download_page']
        update_file = readconf['download_local_files']
    except:
        print("Get local conf error,please check it")
        sys.exit(-1)

    DeleteHisFiles(update_file)

    untreatedurl = file_url

    treatedurl = gettrueurl(untreatedurl)
    SaveLocalUrl(untreatedurl, treatedurl)

    url = treatedurl

    GetTitleInfo(url, int(down_page), update_file, headers)


if __name__ == "__main__":
    main()

The corresponding main.conf looks like this:
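Only a minimal sketch is shown here, based on the keys that ReadLocalFiles() and main() look up; every value is a placeholder and the real site URL is deliberately omitted:

    # main.conf -- placeholder values only
    url=http://example.com
    download_page=5
    download_local_files=down.txt

With a file like this next to the script, each run truncates download_local_files and then fills it with one ed2k link per line.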

I wrote this code out of curiosity about crawlers. If you are interested in it, message me privately and I will share the complete conf details. After all, I work in ops too, I have a reputation to keep, and I never use my servers to download movies.

 
