多线程爬取糗事百科

1.所用到的模块
requests, queue, time, threading, lxml(etree)
2.对于多线程知识有一定了解,类的继承,异常处理等
3.构思自己大致的思路。

import threading
import queue
import requests
import time
from lxml import etree
class thread1(threading.Thread):
    """Fetcher thread: downloads listing pages and queues their HTML.

    Page numbers are taken from ``pagequeue``; the response text of each
    page is put on ``dataqueue``. The loop keeps running until the
    module-level ``flag1`` becomes true.

    Args:
        threadname: Label printed when the thread starts and stops.
        pagequeue: Queue of page numbers to download.
        dataqueue: Queue that receives the downloaded HTML text.
    """

    # Seconds to wait for a page number before re-checking ``flag1``.
    poll_timeout = 0.5
    # Seconds allowed for a single HTTP request before giving up on it.
    request_timeout = 10

    def __init__(self, threadname, pagequeue, dataqueue):
        super(thread1, self).__init__()
        self.threadname = threadname
        self.pagequeue = pagequeue
        self.dataqueue = dataqueue
        self.header = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    def run(self):
        """Download pages until ``flag1`` is set."""
        print('启动进程', self.threadname)
        while not flag1:
            # A blocking get() without a timeout would hang forever on an
            # empty queue and the stop flag would never be looked at again.
            try:
                page = self.pagequeue.get(timeout=self.poll_timeout)
            except queue.Empty:
                continue

            url = 'https://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            try:
                rep = requests.get(
                    url, headers=self.header, timeout=self.request_timeout
                ).text
            except Exception as e:
                # Best effort: one failed page must not kill the worker,
                # but the failure is reported instead of silently dropped.
                print('采集失败', url, e)
                continue

            self.dataqueue.put(rep)
            # Short pause between requests to go easy on the server.
            time.sleep(0.5)

        print('结束进程', self.threadname)

class thread2(threading.Thread):
    """Parser thread: extracts joke text from queued HTML and writes it out.

    HTML documents are taken from ``dataqueue``; the text of every
    ``<a class="recmd-content">`` element found directly under a ``<div>``
    is written to ``filename``, one entry per line. The loop keeps running
    until the module-level ``flag2`` becomes true.

    Args:
        threadname: Label printed when the thread starts and stops.
        dataqueue: Queue of HTML strings to parse.
        filename: Open, writable text file object (not a path).
    """

    # Seconds to wait for a document before re-checking ``flag2``.
    poll_timeout = 0.5

    def __init__(self, threadname, dataqueue, filename):
        super(thread2, self).__init__()
        self.dataqueue = dataqueue
        self.threadname = threadname
        self.filename = filename

    def run(self):
        """Parse queued pages until ``flag2`` is set."""
        print('启动进程', self.threadname)
        while not flag2:
            # A blocking get() without a timeout would hang forever on an
            # empty queue and the stop flag would never be looked at again.
            try:
                rep2 = self.dataqueue.get(timeout=self.poll_timeout)
            except queue.Empty:
                continue

            try:
                html = etree.HTML(rep2)
                if html is None:
                    # Nothing parseable in this document.
                    continue
                for node in html.xpath('//div/a[@class="recmd-content"]'):
                    # xpath() yields element objects, not strings, so the
                    # text has to be extracted before it can be written.
                    text = ''.join(node.itertext()).strip()
                    if text:
                        self.filename.write(text + '\n')
            except Exception as e:
                # Best effort: one bad page must not kill the worker,
                # but the failure is reported instead of silently dropped.
                print('解析失败', e)

        print('结束进程', self.threadname)

# Stop signals polled by the worker loops: thread1 runs while flag1 is
# false, thread2 runs while flag2 is false. main() sets them to True.
flag1 = flag2 = False

def main():
    """Crawl pages 1-10 with one fetcher and one parser thread.

    Fills the page queue, starts both workers, then shuts them down in
    pipeline order: the fetcher is stopped and joined first, so that every
    downloaded page is already on the data queue before the parser is told
    to stop.
    """
    global flag1, flag2

    pagequeue = queue.Queue(10)
    for i in range(1, 11):
        pagequeue.put(i)

    dataqueue = queue.Queue()

    # Explicit UTF-8 so scraped text is not limited to the locale's codec;
    # the context manager closes the file even if something below raises.
    with open(r"C:\offline_FtnInfo.txt", "a", encoding="utf-8") as filename:
        t1 = thread1('采集线程', pagequeue, dataqueue)
        t1.start()
        t2 = thread2('解析线程', dataqueue, filename)
        t2.start()

        # Sleep between polls instead of spinning at 100% CPU.
        while not pagequeue.empty():
            time.sleep(0.1)
        flag1 = True
        # Wait for the fetcher to finish its in-flight page; otherwise the
        # data queue can look empty while one more page is still coming.
        t1.join()

        while not dataqueue.empty():
            time.sleep(0.1)
        flag2 = True
        # The parser may still be writing its last page; join before the
        # file is closed.
        t2.join()

    print('结束')


# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值