Scraping jokes with multiple threads
Requesting the pages and getting the HTML source
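All of the snippets below share a handful of imports: threading and queue from the standard library, csv for output, requests for HTTP, and lxml for parsing.

import csv
import queue
import threading

import requests
from lxml import etree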
Since the multithreaded version involves a fair amount of code, we start by defining a dedicated crawler class:
class CrawlThread(threading.Thread):
    def __init__(self, name, url_queue, data_queue):
        super(CrawlThread, self).__init__()
        self.name = name
        self.url_queue = url_queue
        self.data_queue = data_queue
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

    def run(self):
        print('%s-------- thread started' % self.name)
        while True:
            # get_nowait() both fetches a URL and detects an empty queue,
            # avoiding the race between empty() and get() when several
            # crawler threads compete for the last URL.
            try:
                url = self.url_queue.get_nowait()
            except queue.Empty:
                break
            print(url)
            resp = requests.get(url, headers=self.headers)
            html = resp.content.decode('utf-8')
            self.data_queue.put(html)
        print('%s-------- thread finished' % self.name)
Parsing the pages, extracting the fields we need, and saving them to a file:
class ParseThread(threading.Thread):
    def __init__(self, name, data_queue, lock, fp):
        super(ParseThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.lock = lock
        self.fp = fp

    def run(self):
        print('%s------- thread started' % self.name)
        while True:
            try:
                # Block for up to 10 seconds; once the crawlers have
                # finished and the queue stays empty, the timeout is the
                # signal for this thread to exit.
                data = self.data_queue.get(True, 10)
            except queue.Empty:
                break
            self.parse_html(data)
        print('%s------- thread finished' % self.name)
    def parse_html(self, data):
        html = etree.HTML(data)
        result = html.xpath('//*[@id="content"]')[0]
        articles = result.xpath('./article')
        details = []
        for article in articles:
            # A joke can span several <p> tags, so join the fragments
            # into one string instead of storing a Python list.
            text = ''.join(article.xpath('.//p/text()'))
            likes = article.xpath('./div//span/text()')[2]
            detail_url = article.xpath('.//div[@class="ll_tu"]/a/@href')[0]
            details.append({
                'content': text,
                'likes': likes,
                'detail_url': detail_url,
            })
        # All parser threads share one file handle, so writes are
        # serialized with the lock.
        with self.lock:
            w = csv.DictWriter(self.fp, fieldnames=['content', 'likes', 'detail_url'])
            w.writerows(details)
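Two details are easy to get wrong here. The csv module is not thread-safe, so every batch of rows goes through the shared lock. And the header row is deliberately not written in parse_html: with three parser threads each handling many pages, calling writeheader() here would repeat the header line throughout the file, so it is written exactly once in the main function instead (see the sketch at the end).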
In addition, since there is quite a bit of code, a few helper functions keep things readable.
1. A function that creates the queues
def create_queue():
    # One queue of listing-page URLs to crawl...
    url_queue = queue.Queue()
    base_url = 'https://wengpa.com/duanzi/page/{}/'
    for x in range(1, 21):
        url_queue.put(base_url.format(x))
    # ...and one bounded queue for the downloaded HTML, so the crawlers
    # cannot run arbitrarily far ahead of the parsers.
    data_queue = queue.Queue(1000)
    return url_queue, data_queue
2. Two functions that create the crawler and parser threads
parse_thread_list = []
crawl_thread_list = []

# Create the crawler threads
def create_crawl_threads(url_queue, data_queue):
    names = ['crawler-1', 'crawler-2', 'crawler-3']
    for name in names:
        crawl_thread_list.append(CrawlThread(name, url_queue, data_queue))

# Create the parser threads
def create_parse_threads(data_queue, lock, fp):
    names = ['parser-1', 'parser-2', 'parser-3']
    for name in names:
        parse_thread_list.append(ParseThread(name, data_queue, lock, fp))
Finally, a main function ties all of these pieces together, as sketched below.
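A minimal sketch of that main function, matching the classes and helpers above. The output filename duanzi.csv is an assumption, and the CSV header is written here, once, before any parser thread runs:

def main():
    # A lock shared by the parser threads so their CSV writes don't interleave.
    lock = threading.Lock()
    # 'duanzi.csv' is an assumed output name; newline='' is what the csv
    # module expects, and utf-8-sig lets Excel open the Chinese text cleanly.
    with open('duanzi.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        # Write the header row exactly once, before the threads start.
        writer = csv.DictWriter(fp, fieldnames=['content', 'likes', 'detail_url'])
        writer.writeheader()

        url_queue, data_queue = create_queue()
        create_crawl_threads(url_queue, data_queue)
        create_parse_threads(data_queue, lock, fp)

        # Start everything, then wait: the crawlers exit when the URL queue
        # is drained, and the parsers exit after their 10-second get() timeout.
        for t in crawl_thread_list + parse_thread_list:
            t.start()
        for t in crawl_thread_list + parse_thread_list:
            t.join()

if __name__ == '__main__':
    main()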