import re
import requests
import threading
import csv
import time
import queue
g_writecount = 0
exist_url = []
all_url = []
csv_lock = threading.Lock()  # serialize csv writes across the worker threads
def GetUrls1(url, depth=1):  # depth-limited recursion that collects the level 1-3 pages into the all_url list
    global g_writecount, all_url  # entries look like ['01/420112.html', '01/420114.html', ...], roughly 3,300 in total
    try:
        url1 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/' + url  # e.g. index.html
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url1, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        print(url1, 'fetch failed')
        exist_url.append(url)
        return None
    exist_url.append(url)
    pattern = re.compile("<a href='(.*?)'>")
    unique_list = list(set(re.findall(pattern, r.text)) - set(exist_url))
    for eachone in unique_list:
        g_writecount += 1
        output = 'NO.' + str(g_writecount) + '\t Depth:' + str(depth) + '\t' + url + '->' + eachone + '\n'
        print(output)
        # with open('title1.txt', 'a+') as f:
        #     f.write(output)
        if depth < 3:
            GetUrls1(eachone, depth + 1)
    all_url = all_url + unique_list
def GetUrls2(q):
    try:
        i = q.get(timeout=2)  # take the next url that main pushed in via workQueue.put(url)
    except queue.Empty:  # the queue drained between the worker's empty() check and this get()
        return
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/' + i[3:5] + '/' + i  # build the full crawlable url
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        pattern = re.compile("<a href='(.*?)'>")
        result = list(set(re.findall(pattern, r.text)))
    except Exception:
        print(url, 'fetch failed')
        return  # without this return, the loop below would hit an undefined result
    for each in result:  # result is a list like ['01/110101001.html', '01/110101002.html', '01/110101003.html', ...]
        url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2009/' + each[3:5] + '/' + each[5:7] + '/' + each  # full level-5 url
        try:
            GetDatas(url)  # scrape the wanted fields from the level-5 page
        except Exception:
            continue  # on error, move on to the next page
def GetDatas(url):  # scrape the wanted fields from a level-5 page
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=kv, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        pattern = re.compile(r"<tr class='villagetr'><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>")
        Data = re.findall(pattern, html)
        with csv_lock:  # only one thread may append at a time
            with open('网站.csv', 'a+', newline='') as f:  # append to the same csv the header was written to
                f_csv = csv.writer(f)
                f_csv.writerows(Data)
    except Exception:
        print(url, 'fetch failed')
class myThread(threading.Thread):
    def __init__(self, q):
        threading.Thread.__init__(self)
        self.q = q

    def run(self):
        while not self.q.empty():  # stop once the shared queue is drained
            GetUrls2(self.q)
if __name__ == '__main__':
    start = time.time()
    headers = ['代码', '城乡分类', '名称']  # code, urban-rural classification, name
    with open('网站.csv', 'a+', newline='') as f:  # create the csv and write the header row
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
    GetUrls1('index.html')  # populate all_url
    listNum = len(all_url)
    workQueue = queue.Queue(listNum)
    threads = []
    for url in all_url:
        workQueue.put(url)
    for i in range(5):  # five worker threads share one queue
        thread = myThread(workQueue)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    end = time.time()
    print(end - start)
1. The recursive function GetUrls1(url, depth=1) replaces near-identical code for each site level with a single definition, which is more concise. Note that this only works when the pages at every level share the same markup! (A minimal sketch of the pattern follows below.)
2. Crawling is an IO-bound task, so threading cuts the total fetch time substantially, and pairing it with a Queue to hand out the work saves even more. (See the second sketch below.)
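To show note 1 in isolation, here is a minimal sketch of the same depth-limited recursion, stripped of the bookkeeping; extract_links is a hypothetical helper introduced for this sketch, not part of the script above:

import re
import requests

def extract_links(url):
    # hypothetical helper: fetch a page and pull out its relative links
    r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=5)
    r.raise_for_status()
    return re.findall("<a href='(.*?)'>", r.text)

def crawl(url, depth=1, max_depth=3, seen=None):
    # one function covers every level, because the pages share the same link markup
    if seen is None:
        seen = set()
    seen.add(url)
    for link in set(extract_links(url)) - seen:
        print('Depth:', depth, url, '->', link)
        if depth < max_depth:
            crawl(link, depth + 1, max_depth, seen)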
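And a minimal sketch of the threading + Queue division of labour from note 2, independent of the crawler specifics (the urls are placeholders):

import queue
import threading
import requests

def worker(q):
    while True:
        try:
            url = q.get(timeout=2)  # grab the next unit of work
        except queue.Empty:  # queue drained, let this thread exit
            return
        try:
            r = requests.get(url, timeout=5)  # the IO-bound step the threads overlap
            print(url, r.status_code)
        except requests.RequestException:
            print(url, 'fetch failed')

q = queue.Queue()
for url in ['http://example.com/a', 'http://example.com/b']:  # placeholder urls
    q.put(url)
threads = [threading.Thread(target=worker, args=(q,)) for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()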