# 用到的基本就是之前提到的线程小框架及逻辑 — uses the small threading framework and logic described earlier.
# -*- encoding: utf-8 -*-
import requests
from lxml import etree
import threading
import os
class Cnbeta(object):
    """Scrape the cnbeta.com front page and save each headline article to disk.

    Workflow: ``get_url`` fetches (and locally caches) the front page,
    ``get_data`` collects the headline article URLs into ``self.thread``,
    and ``get_xiang`` downloads one article and writes it under ``./upload/``.
    """

    def __init__(self):
        # Per-instance list of collected article URLs.
        # Bug fix: this used to be a mutable CLASS attribute (`thread = []`),
        # which every instance would have shared and mutated.
        self.thread = []

    # Fetch the site front page.
    def get_url(self, url):
        """Return the front-page HTML, downloading it only once.

        On first call the page is fetched with ``requests`` and cached in
        ``./cnbeta.html``; later calls read the cache instead of hitting
        the network.
        """
        if not os.path.exists('cnbeta.html'):
            res = requests.get(url)
            html = res.content.decode("utf-8")
            with open('./cnbeta.html', 'w', encoding='utf-8') as f:
                f.write(html)
            return html
        with open('./cnbeta.html', encoding='utf-8') as f:
            return f.read()

    # Collect article URLs from the front page.
    def get_data(self, html):
        """Parse *html* and append every headline article URL to ``self.thread``."""
        tree = etree.HTML(html)
        links = tree.xpath("//div[@class='headline-thumb']/a/@href")
        self.thread.extend(links)

    # Download one article page and store it as a file.
    def get_xiang(self, res):
        """Fetch the article at URL *res* and write it to ``./upload/<name>.html``."""
        # Derive the file name from the last path segment of the URL.
        name = str(res).split('/')[-1].replace('.htm', '')
        r = requests.get(res)
        body = r.content.decode('utf-8')
        path = './upload/'
        # Bug fix: the original assumed ./upload/ already existed and
        # crashed with FileNotFoundError otherwise.
        os.makedirs(path, exist_ok=True)
        with open(path + name + '.html', 'w', encoding='utf-8') as f:
            print("*****")
            f.write(body)
if __name__ == "__main__":
    cnbeta = Cnbeta()
    html = cnbeta.get_url("https://www.cnbeta.com/")
    # Collect the article URLs into cnbeta.thread.
    cnbeta.get_data(html)
    print(cnbeta.thread)
    # Bug fix: the original called join() immediately after start() inside
    # the loop, so downloads ran one at a time and threading bought nothing.
    # Start every worker first, then wait for them all.
    workers = []
    for link in cnbeta.thread:
        worker = threading.Thread(target=cnbeta.get_xiang, args=(link,))
        worker.daemon = True  # setDaemon() is deprecated since Python 3.10
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
# 爬虫有危险,抓取需谨慎 — scraping carries risk; crawl responsibly and respect the target site.