My first crawler, haha, purely procedural code.
Goals:
1. Read the URL from a local conf file, then scrape the video titles and their download URLs from that site.
2. Write the scraped download URLs to a separate local file, so they can be copied and pasted into a download tool.
Enough preamble; the code is as follows:
#!/usr/local/python/bin/python3

import requests
import re
import chardet
import random
import signal
import time
import os
import sys

def DealwithURL(url):
    # Check whether the page is just a meta-refresh redirect stub.
    # Return the redirect target if so, or True if the URL is already final.
    r = requests.get(url)
    pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=')
    findurl = re.findall(pattern, r.text)

    if findurl:
        pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
        transferurl = re.findall(pattern, r.text)[0]
        return transferurl
    else:
        return True

def GetNewURL(url):
    # The redirect stub announces the new address inside an alert().
    # Note the unescaped "(" after "alert": it opens the capture group, so the
    # captured text begins with '("', which the split/slice below strips off
    # to leave the bare URL.
    r = requests.get(url)

    r.encoding = 'utf-8'
    pattern = re.compile('alert(.*)">')
    findurl = re.findall(pattern, r.text)
    findurl_str = " ".join(findurl)
    return findurl_str.split(' ', 1)[0][2:]

def gettrueurl(url):
    # Resolve the real site URL, following the meta-refresh stub at most once.
    result = DealwithURL(url)
    if result is True:
        return url
    else:
        return GetNewURL(result)

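To make the redirect handling concrete, here is a minimal sketch of the kind of stub page DealwithURL looks for (the HTML string is hypothetical, inferred from the regex above):

import re

# Hypothetical stub page: a meta-refresh pointing at the real site.
sample = '<meta http-equiv="refresh" content="0.1;url=http://example.com/index.html">'
pattern = re.compile('<meta http-equiv="refresh" content="0.1;url=(.*)"')
print(re.findall(pattern, sample))   # ['http://example.com/index.html']
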
def SaveLocalUrl(untreatedurl, treatedurl):
    # If the conf still holds the old URL, rewrite main.conf in place
    # so the next run can skip the redirect lookup.
    if untreatedurl == treatedurl:
        pass
    else:
        try:
            fileconf = open(r'main.conf', 'r')
            rewritestr = ""

            for readline in fileconf:
                if re.search(re.escape(untreatedurl), readline):
                    # escape the URL so dots and slashes are matched literally
                    readline = re.sub(re.escape(untreatedurl), treatedurl, readline)
                rewritestr = rewritestr + readline
            fileconf.close()

            fileconf = open(r'main.conf', 'w')
            fileconf.write(rewritestr)
            fileconf.close()

        except Exception:
            print("Got the new URL but failed to rewrite main.conf; write to logs")

def handler(signum, frame):
    # SIGALRM handler: turn a timeout into an exception we can catch.
    raise AssertionError

def WriteLocalDownloadURL(downfile, downurled2k):
    # Append one download link per line to the local output file.
    urlfile = open(downfile, 'a+')
    urlfile.write(downurled2k + '\n')
    urlfile.close()

def GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers):
    downurlstr = " ".join(titleurl)
    downnamestr = " ".join(titlename)

    # headers must be passed as a keyword argument; as the second positional
    # argument requests.get() would treat it as query params.
    r = requests.get(sourceurl + downurlstr, headers=headers)
    pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')

    downurled2k = re.findall(pattern, r.text)
    downurled2kstr = " ".join(downurled2k)

    WriteLocalDownloadURL(update_file, downurled2kstr)

    print(downnamestr, downurled2kstr)

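The textarea pattern above is what pulls the actual ed2k link out of each detail page. A minimal sketch of the fragment it expects (the HTML and the ed2k link are both hypothetical):

import re

sample = ('<div><textarea rows="2" autocomplete="on">'
          'ed2k://|file|movie.mkv|1234567|0123456789ABCDEF0123456789ABCDEF|/'
          '</textarea></div>')
pattern = re.compile('autocomplete="on">(.*)/</textarea></div>')
print(re.findall(pattern, sample))
# ['ed2k://|file|movie.mkv|1234567|0123456789ABCDEF0123456789ABCDEF|']
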
def ReadLocalFiles():
    # Parse main.conf into a dict: one key=value per line, '#' starts a comment.
    # Note: parsing stops at the first blank line.
    returndict = {}
    localfiles = open(r'main.conf')

    readline = localfiles.readline().rstrip()

    while readline:
        if readline.startswith('#'):
            pass
        else:
            try:
                readline = readline.rstrip()
                returndict[readline.split('=')[0]] = readline.split('=')[1]
            except Exception:
                print("Please check your conf: %s" % readline)
                sys.exit(1)
        readline = localfiles.readline().rstrip()

    localfiles.close()
    return returndict

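For example, given a hypothetical conf line url=http://example.com, ReadLocalFiles() returns {'url': 'http://example.com'}. Every value comes back as a string, which is why main() later casts download_page with int() before using it.
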
def GetListURLinfo(sourceurl, title, getpagenumber, total, update_file, headers):
    # Walk randomly chosen list pages and hand every entry to GetDownloadURL.
    # Clamp the number of pages to download to the range [2, 100].
    if total >= 100:
        total = 100

    if total <= 1:
        total = 2

    # Pick pages from [1, getpagenumber]; never go past the site's actual page count.
    getpagenumber = min(getpagenumber, total)

    for number in range(0, total):
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(3)

            url = sourceurl + title + '-' + str(random.randint(1, getpagenumber)) + '.html'

            r = requests.get(url, headers=headers)

            pattern = re.compile('<div class="info"><h2>(.*)</a><em></em></h2>')
            r.encoding = chardet.detect(r.content)['encoding']
            allurl = re.findall(pattern, r.text)

            for lineurl in allurl:
                try:
                    signal.signal(signal.SIGALRM, handler)
                    signal.alarm(3)

                    pattern = re.compile('<a href="(.*)" title')
                    titleurl = re.findall(pattern, lineurl)

                    pattern = re.compile('title="(.*)" target=')
                    titlename = re.findall(pattern, lineurl)

                    GetDownloadURL(sourceurl, titleurl, titlename, update_file, headers)
                    signal.alarm(0)
                except AssertionError:
                    print(lineurl, titlename, "Timeout Error: the request did not finish within 3s")
                    continue

            signal.alarm(0)   # cancel the page-level alarm once the page is done

        except AssertionError:
            print("GetListURLinfo Error")
            continue

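The handler/alarm pairing used above is the classic Unix timeout idiom (signal.alarm has no effect on Windows). A minimal standalone sketch, where do_slow_request is a hypothetical stand-in for a hanging requests.get():

import signal
import time

def handler(signum, frame):
    raise AssertionError          # fired when the alarm expires

def do_slow_request():
    time.sleep(10)                # hypothetical stand-in for a hanging call

signal.signal(signal.SIGALRM, handler)
signal.alarm(3)                   # deliver SIGALRM in 3 seconds
try:
    do_slow_request()
except AssertionError:
    print("timed out")
finally:
    signal.alarm(0)               # always cancel a pending alarm
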
def GetTitleInfo(url, down_page, update_file, headers):
    # Pick one of the eight list categories at random, read the total page
    # count from the pager text, then crawl down_page pages from that list.
    title = '/list/' + str(random.randint(1, 8))

    titleurl = url + title + '.html'

    r = requests.get(titleurl, headers=headers)
    r.encoding = chardet.detect(r.content)['encoding']
    pattern = re.compile(' 当前:.*/(.*)页 ')
    getpagenumber = re.findall(pattern, r.text)

    getpagenumber = " ".join(getpagenumber)

    GetListURLinfo(url, title, int(getpagenumber), int(down_page), update_file, headers)

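The pager regex is easy to misread; a quick sketch against a hypothetical pager fragment shows what it captures:

import re

sample = ' 当前:1/25页 '   # hypothetical pager text: "current: page 1 of 25"
print(re.findall(' 当前:.*/(.*)页 ', sample))   # ['25']
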
def write_logs(logtime, logs):
    # Append a timestamped line to the local log file.
    # (Parameter renamed from "time" to avoid shadowing the time module.)
    loginfo = str(logtime) + logs
    try:
        logfile = open(r'logs', 'a+')
        logfile.write(loginfo)
        logfile.close()
    except Exception:
        print("Write logs error, code:154")


def DeleteHisFiles(update_file):
    # Truncate last run's download list so each run starts clean.
    if os.path.isfile(update_file):
        try:
            download_files = open(update_file, 'r+')
            download_files.truncate()
            download_files.close()
        except Exception:
            print("Delete " + update_file + " Error --code:166")
    else:
        print("Build new downfiles")

def main():
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

    readconf = ReadLocalFiles()

    try:
        file_url = readconf['url']
        down_page = readconf['download_page']
        update_file = readconf['download_local_files']
    except Exception:
        print("Get local conf error, please check it")
        sys.exit(-1)

    DeleteHisFiles(update_file)

    untreatedurl = file_url

    treatedurl = gettrueurl(untreatedurl)
    SaveLocalUrl(untreatedurl, treatedurl)

    url = treatedurl

    GetTitleInfo(url, int(down_page), update_file, headers)


if __name__ == "__main__":
    main()
The corresponding main.conf looks like this:
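(The real conf isn't reproduced here; below is a minimal sketch with hypothetical values, covering just the three keys main() reads. Lines starting with '#' are treated as comments by ReadLocalFiles.)

# main.conf -- hypothetical values
url=http://example.com
download_page=5
download_local_files=downloads.txt
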
I wrote this code out of sheer curiosity about crawlers. If you're interested in it, message me privately for the full conf info. After all, I work in ops too, and I have a reputation to keep: I never use my servers to download movies.