下面是我初学多线程之后,动手写的一个小爬虫。代码个人看着不那么优雅,无奈水平不够。
如果您发现有错误或者可以改进的地方,望能指正。
import re
import urllib.request as ur
from queue import Empty
from queue import Queue as qq
from threading import Lock
from threading import Thread as tt
# Fetch the HTML source of a page
def get_html(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header.

    Args:
        url: address of the page to download.
        timeout: seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the caller forever.

    Returns:
        The page body as a ``str``.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    res = ur.Request(url)
    res.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
    # The context manager closes the response (and its socket) even when
    # read() or decode() raises.
    with ur.urlopen(res, timeout=timeout) as page:
        return page.read().decode('utf-8')
# Extract the regex-matched fields from a page
def match_data(html):
    """Run the three module-level patterns over *html* and pair up the hits.

    Returns:
        A ``zip`` iterator of ``(mall, title, price)`` tuples. ``zip`` stops
        at the shortest of the three match lists.
    """
    titles = res_title.findall(html)
    prices = res_price.findall(html)
    malls = res_mall.findall(html)
    return zip(malls, titles, prices)
# Build the list of page addresses
def urls(home, pages=10):
    """Return the page URLs to crawl for every prefix in *home*.

    Args:
        home: sequence of URL prefixes; the page number is appended to each.
        pages: number of pages per prefix, numbered from 1.

    Returns:
        A list ordered page by page: page 1 of every prefix, then page 2 of
        every prefix, and so on. Empty when *home* is empty.
    """
    # Iterate the prefixes themselves rather than a fixed index range, so the
    # function works for any number of prefixes.
    return [prefix + str(page) for page in range(1, pages + 1) for prefix in home]
# Append the scraped records to a text file
def write_data(data, path='smzdm.txt'):
    """Append every record in *data* to a UTF-8 text file, one per line.

    Args:
        data: iterable of records; each record is an iterable of strings
            whose items are joined with ``'----'``.
        path: file to append to; it is created if it does not exist.
    """
    # Plain append mode: the file is only ever written, never read back.
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines('----'.join(record) + '\n' for record in data)
# Pre-compiled patterns, built once at import time. Each captures, non-greedily:
#   res_title - the text between `;" >` and the following `<span class="red"`
#   res_price - the text between `class="red">` and the following `</span`
#   res_mall  - the text between `class="mall">` and the following `</a>`
res_title, res_price, res_mall = (
    re.compile(pattern)
    for pattern in (
        r';" >(.*?)<span class="red"',
        r'class="red">(.*?)</span',
        r'class="mall">(.*?)</a>',
    )
)
# URL prefixes copied from the smzdm.com ("What's Worth Buying") website.
# Each ends in "/p"; urls() appends the page number.
home = [
    'http://www.smzdm.com/youhui/shangpin/%E4%BA%AC%E4%B8%9C/p',
    'http://www.smzdm.com/youhui/shangpin/%E4%BA%9A%E9%A9%AC%E9%80%8A%E4%B8%AD%E5%9B%BD/p',
    'http://www.smzdm.com/youhui/shangpin/%E5%A4%A9%E7%8C%AB%E7%B2%BE%E9%80%89/p',
    'http://www.smzdm.com/youhui/shangpin/%E8%81%9A%E5%88%92%E7%AE%97/p',
]
url = urls(home)

# Work queue of page URLs; Queue is first-in, first-out by default.
queue_url = qq()
for page_url in url:
    queue_url.put_nowait(page_url)
# Upper bound on the number of worker threads running at the same time.
_MAX_WORKERS = 8
# Held while appending to the output file, so rows written by different
# threads cannot interleave.
_write_lock = Lock()


def _crawl(xurl):
    """Download one page, extract its records and append them to the output file."""
    html = get_html(xurl)
    # match_data() returns a lazy zip; materialise it before taking the lock
    # so that only the file append itself is serialised.
    data = list(match_data(html))
    with _write_lock:
        write_data(data)


# Main control function
def main():
    """Crawl the next URL waiting in ``queue_url``.

    Raises:
        queue.Empty: if no URL is left in the queue.
    """
    _crawl(queue_url.get_nowait())


def _worker():
    """Crawl queued URLs one after another until the queue is exhausted."""
    while True:
        try:
            xurl = queue_url.get_nowait()
        except Empty:
            return
        try:
            _crawl(xurl)
        except Exception as exc:
            # Thread boundary: report the failure and move on, so one bad
            # page does not stop this worker from handling the remaining URLs.
            print('failed to crawl {}: {!r}'.format(xurl, exc))


def _run_workers():
    """Start a bounded pool of worker threads and wait for all of them."""
    worker_count = min(_MAX_WORKERS, queue_url.qsize())
    threads = [tt(target=_worker) for _ in range(worker_count)]
    for th in threads:
        th.start()
    # Join every worker, not just the last one started, so the crawl is
    # known to be complete when this function returns.
    for th in threads:
        th.join()


# Multithreading: drain the URL queue with the worker pool.
_run_workers()