古诗+代码 = 绝配

爬虫属于IO密集型任务,比较适合使用多线程:虽然受GIL限制,Python的多线程不能并行执行字节码,但线程在等待网络响应或磁盘写入时会释放GIL,其他线程可以趁机运行,因此边爬取边写入,通常会比逐条串行读写的速度快。

import re
import requests
import csv
from queue  import Queue
import threading

class POSpider(threading.Thread):
	"""Producer thread: downloads listing pages and queues (title, poem) pairs.

	Each worker repeatedly takes a URL from ``page_queue``, fetches it,
	extracts the poem titles and bodies with regular expressions, and puts one
	``(title, poem)`` tuple per poem onto ``poems_queue``. The worker exits as
	soon as ``page_queue`` has no more URLs.
	"""

	# Browser-like request headers sent with every page request.
	headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE','cookie':'Hm_lvt_9007fab6814e892d3020a64454da5a55=1599659724,1599704234; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1599713077'}

	# Compiled once at class creation instead of on every page.
	_TITLE_RE = re.compile(r'div\sclass="cont".*?<b>(.*?)</b>', re.S)  # poem title
	_POEM_RE = re.compile(r'div\sclass="contson".*?">.*?(.*?)</div>', re.S)  # poem body
	_TAG_RE = re.compile('<.*?>')  # leftover HTML tags inside a poem body

	def __init__(self,page_queue,poems_queue,*args,**kwargs):
		"""Store the two queues; extra arguments go to ``threading.Thread``.

		Args:
			page_queue: queue of page URLs still to be downloaded.
			poems_queue: queue that receives ``(title, poem)`` tuples.
		"""
		super(POSpider,self).__init__(*args,**kwargs)
		self.page_queue = page_queue
		self.poems_queue = poems_queue

	@classmethod
	def _extract_poems(cls, text):
		"""Return the ``(title, poem)`` pairs found in one page of HTML.

		Tags inside each poem body are removed. The list is built from scratch
		for every page, so titles are only ever paired with poems from the
		same page.
		"""
		titles = cls._TITLE_RE.findall(text)
		poems = [cls._TAG_RE.sub('', body) for body in cls._POEM_RE.findall(text)]
		return list(zip(titles, poems))

	def run(self):
		"""Download and parse pages until ``page_queue`` is exhausted."""
		from queue import Empty
		while True:
			# get_nowait() avoids the empty()/get() race: with several workers,
			# another thread may take the last URL between the two calls and a
			# blocking get() would then wait forever.
			try:
				url = self.page_queue.get_nowait()
			except Empty:
				break
			try:
				# A timeout keeps a stalled connection from hanging the worker.
				response = requests.get(url,headers=self.headers,timeout=30)
			except requests.RequestException as e:
				print('failed to download %s: %s' % (url, e))
				continue
			for pair in self._extract_poems(response.text):
				self.poems_queue.put(pair)  # hand the title and poem to the writers
			print('+'*30+"第%s页已下载完成!" %url.split('0AA')[-1]+'+'*30)
class POWriter(threading.Thread):
	"""Consumer thread: takes (title, poem) pairs off a queue and writes CSV rows.

	Several writers share one ``csv`` writer object, so every ``writerow`` call
	is serialized with the shared lock. A writer exits once the queue has
	stayed empty for ``timeout`` seconds.
	"""

	# Not used by this class; kept so existing references to POWriter.headers
	# keep working.
	headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE','cookie':'Hm_lvt_9007fab6814e892d3020a64454da5a55=1599659724,1599704234; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1599713077'}

	def __init__(self,poems_queue,gLock,writer,*args,timeout=50,**kwargs):
		"""Store the shared resources; extra arguments go to ``threading.Thread``.

		Args:
			poems_queue: queue yielding ``(title, poem)`` tuples.
			gLock: lock shared by all writers, guarding ``writer``.
			writer: object with a ``writerow`` method (e.g. ``csv.writer``).
			timeout: seconds to wait for a new item before the thread exits.
		"""
		super(POWriter,self).__init__(*args,**kwargs)
		self.poems_queue = poems_queue
		self.writer = writer
		self.lock = gLock
		self.timeout = timeout

	def run(self):
		"""Write queued poems until the queue stays empty for ``timeout`` seconds."""
		from queue import Empty
		while True:
			try:
				x,y = self.poems_queue.get(timeout=self.timeout)
			except Empty:
				# Nothing arrived in time: assume the spiders are done.
				break
			try:
				# "with" releases the lock even when writerow raises. Releasing
				# by hand leaves the lock held on an encoding error and blocks
				# every other writer forever.
				with self.lock:
					self.writer.writerow((x,y))  # one poem per row
			except UnicodeEncodeError as e:
				# The output encoding cannot represent a character in this
				# poem: report it and keep going with the remaining poems.
				print('skipped a row that cannot be encoded: %s' % e)
				continue
			print('保存一首')

def main():
	"""Crawl listing pages 1-5 and append the poems to ``chinapoems.csv``.

	Starts five POSpider threads that fill the poem queue and five POWriter
	threads that drain it into the CSV file, then waits for all of them so the
	file is flushed and closed before returning.
	"""
	page_queue = Queue(10)
	poems_queue = Queue(30)
	gLock = threading.Lock()
	# gb18030 is a superset of gbk, so rows already in the file stay readable,
	# and it can also encode rare characters such as U+4729 that gbk rejects
	# with UnicodeEncodeError.
	with open('chinapoems.csv','a',newline='',encoding='gb18030') as fp:
		writer = csv.writer(fp)
		# The file is opened in append mode: only write the header when it is
		# still empty, otherwise every run adds another header row.
		if fp.tell() == 0:
			writer.writerow(('title','content'))
		for x in range(1,6):
			url = 'https://so.gushiwen.cn/shiwen/default_0AA%d.aspx' % x
			page_queue.put(url)
		workers = [POSpider(page_queue,poems_queue) for _ in range(5)]
		workers += [POWriter(poems_queue,gLock,writer) for _ in range(5)]
		for t in workers:
			t.start()
		# Keep the file open until every thread has finished with it.
		for t in workers:
			t.join()
if __name__ == "__main__":
	main()

爬取的内容不存在问题,但遍历的页面一多,写入CSV的速度就很慢,而且会假死。分析大概是诗中有GBK编码无法表示的生僻汉字,写入时编码失败。
在这里插入图片描述

事实证明的确是因为曹操诗句中有个复杂的汉字:䜩,以至于进行不下去。

UnicodeEncodeError details: 'gbk' codec can't encode character '\u4729' in position 126: illegal multibyte sequence 短歌行 两汉:曹操
对酒当歌,人生几何!譬如朝露,去日苦多。慨当以慷,忧思难忘。何以解忧?唯有杜康。青青子衿,悠悠我心。但为君故,沉吟至今。呦呦鹿鸣,食野之苹。我有嘉宾,鼓瑟吹笙。明明如月,何时可掇?忧从中来,不可断绝。越陌度阡,枉用相存。契阔谈䜩,心念旧恩。(谈䜩 一作:谈宴)月明星稀,乌鹊南飞。绕树三匝,何枝可依?山不厌高,海不厌深。周公吐哺,天下归心。

import re
import requests

# Browser-like request headers sent with every page request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 QIHU 360SE','cookie':'Hm_lvt_9007fab6814e892d3020a64454da5a55=1599659724,1599704234; Hm_lpvt_9007fab6814e892d3020a64454da5a55=1599713077'}
def extract_entries(text):
	"""Turn one page of HTML into a list of formatted poem entries.

	Each entry is ``'\\t' + title + '\\t\\t' + author + poem + '\\n'``, with the
	HTML tags removed from the author and poem parts. Titles, authors and
	poems are matched up by position; surplus items in any of the three lists
	are dropped.
	"""
	titles = re.findall(r'div\sclass="cont".*?<b>(.*?)</b>',text,re.S)  # poem titles
	poems = re.findall(r'div\sclass="contson".*?">.*?(.*?)</div>',text,re.S)  # poem bodies
	authors = re.findall(r'p\sclass="source".*?>.*?(.*?)</p>',text,re.S)  # author lines
	# Author lines and poem bodies still contain markup; strip every tag.
	authorG = [re.sub('<.*?>','',a) for a in authors]
	poemsG = [re.sub('<.*?>','',p) for p in poems]
	return ['\t'+t+'\t\t'+a+p+'\n' for t,a,p in zip(titles,authorG,poemsG)]


def parse_page(url):
	"""Download ``url`` and return its poems as formatted text entries.

	Args:
		url: address of one listing page.

	Returns:
		A list of strings, one per poem, as built by ``extract_entries``.
	"""
	# A timeout keeps a stalled connection from blocking the crawl forever.
	response = requests.get(url,headers=headers,timeout=30)
	return extract_entries(response.text)


def  main():
	"""Crawl listing pages 1-39 and append every poem to ``chinapoems.txt``."""
	# Open the file once instead of once per poem, and state the encoding
	# explicitly: the locale default (gbk on Chinese Windows) cannot encode
	# rare characters such as U+4729 and raises UnicodeEncodeError.
	with open('chinapoems'+'.txt','a',encoding='utf-8') as f:
		for x in range(1,40):
			url = 'https://so.gushiwen.cn/shiwen/default_0AA%d.aspx' % x
			contents = parse_page(url)
			for i in contents:
				try:
					f.write(i)  # write() takes a string, not a dict, tuple or list
				except UnicodeEncodeError as e:
					# Report exactly which entry failed, then move on.
					print("UnicodeEncodeError details: " +str(e)+i)

if __name__ == "__main__":
	main()
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值