【Python多线程】听力

最新推荐文章于 2024-07-13 21:34:41 发布

fangqileo

最新推荐文章于 2024-07-13 21:34:41 发布

阅读量137

点赞数

文章标签： python 爬虫多线程正则表达式

本文链接：https://blog.csdn.net/fangqileo/article/details/121022568

版权

导入模块

import requests
import random
from pyquery import PyQuery as pq
import re
import threading
import os

设置ua（即下文各函数共用的全局请求头 headers，需在运行前自行定义），这里不赘述。

这里每一个选项都是一个列表页，首先我们获取所有列表页的url。

这里用pyquery找到所有a节点后，调用items()得到一个生成器，再用列表推导式取出每个节点的href，直接返回一个列表。

def get_page():
	"""Return the href of every listing-page link on the TPO listening index.

	Downloads the index page with the module-level ``headers`` (defined
	outside this block), selects the anchors under
	``.cssTopTitleList.clearfix`` and collects their ``href`` attributes.

	Returns:
		A list with one href value per matched anchor, in document order.
	"""
	index_url = 'https://top.zhan.com/toefl/listen/alltpo.html'
	listing = pq(requests.get(url=index_url,headers=headers).text)

	urls = []
	for anchor in listing('.cssTopTitleList.clearfix a').items():
		urls.append(anchor.attr.href)

	# Echo the collected links so progress is visible on the console.
	print(urls)
	return urls

接下来，用正则表达式找到每个列表页中所有详情页（每个tpo中的每一个大题）的url。

用正则表达式匹配“学习/回顾”链接，findall会把所有匹配到的url放在一个列表里返回。

def get_tpo_url(url):
	"""Return the detail-page URLs found on one TPO listing page.

	Fetches ``url`` with the module-level ``headers`` and collects the
	``href`` of every "学习/回顾" review link matched by the pattern below.

	Args:
		url: Address of a listing page.

	Returns:
		A list of href strings. The list is empty when the response status
		is not 200, so callers can always ``extend`` with the result.
	"""
	response = requests.get(url=url,headers=headers)
	if response.status_code != 200:
		# Falling through here would return None and break list.extend()
		# in the caller; an empty list keeps the crawl going.
		return []

	# Raw string: "\s" and "\S" are regex classes, not Python string escapes.
	url_pattern = re.compile(r'<a class="md_click sensors_maidian cssReview"[\s\S]*?href="(.*?)" target="_blank" event_type="E_5_14">学习/回顾</a>')
	urls = url_pattern.findall(response.text)
	# All single-set page URLs of this listing page.
	print(urls)
	return urls

进到每个大题里面，发现还有5-6道小题，每道题都有自己的url，这里找到每个大题中所有小题的url，并综合成一个列表返回。

# Collect the URLs of all sub-questions inside one set (big question) page
# and pass the whole list on to parse_page.
def get_question_page(url):
	"""Find every sub-question link of one set page and parse the set.

	Runs as a worker-thread target: the spawning loop acquires the global
	``semaphore`` before starting the thread, and ``parse_page`` releases it
	when it finishes.

	Args:
		url: Address of a set page.

	Raises:
		Exception: Whatever the download or parsing raised. The semaphore
			slot is released before the exception is re-raised.
	"""
	try:
		response = requests.get(url=url,headers=headers)
		doc = pq(response.text)
		question_urls = [link.attr.href for link in doc('#footer_review a').items()]
	except Exception:
		# parse_page is what normally frees the caller's semaphore slot. If
		# we never get that far, free it here so the spawning loop cannot
		# stall once enough workers have failed.
		semaphore.release()
		raise

	print(question_urls)
	# All sub-question URLs of this set, handled as one unit.
	parse_page(question_urls)

拿到每个小题的url就好办了，这里我们要获取的是原文、问题、选项、副标题。

def parse_page(set_question_urls_list):
	"""Scrape one listening set and hand the result to ``save_txt``.

	The first page of the set supplies the title (first space-separated
	token of ``<title>``), the article text and the subtitle. Every page
	supplies one question with its option texts.

	The global ``semaphore`` is always released on exit, including when a
	request or the save step raises, so a failed set cannot permanently
	occupy a concurrency slot.

	Args:
		set_question_urls_list: URLs of all sub-questions of one set. An
			empty list is skipped without saving anything.
	"""
	try:
		if not set_question_urls_list:
			# Nothing to fetch; indexing [0] would raise IndexError.
			return

		title = article = subtitle = ''
		# Every [question, options] pair of this set.
		set_que_plus_opt = []
		for index, url in enumerate(set_question_urls_list):
			response = requests.get(url=url,headers=headers)
			doc = pq(response.text)
			if index == 0:
				# Set-level fields are read from the first page, reusing this
				# download instead of requesting the same URL twice.
				title = doc('title').text().split(' ')[0]
				article = doc('.article').text()
				subtitle = doc('.arrow .last_crumbs').text()
			question = doc('.left.text').text()
			options_list = [option.text() for option in doc('.ops.sec ').items()]
			set_que_plus_opt.append([question,options_list])

		print(title,'\n',article,'\n',set_que_plus_opt)

		save_txt(title,article,set_que_plus_opt,subtitle)
	finally:
		semaphore.release()

定义存储函数，这里将文档存储为txt，将每个tpo对应的所有大题存储成一个txt，方便查阅。

def save_txt(title,article,set_que_plus_opt,subtitle):
	"""Append one listening set to ``tpo_listening2/<title>.txt``.

	The record is written as: subtitle, a blank line, the article, 50
	newlines, the formatted questions, and 50 more newlines.

	Args:
		title: File name stem; sets with the same title share one file.
		article: Text of the listening passage.
		set_que_plus_opt: Sequence of ``[question, options]`` items, where
			``options`` is a list of option strings.
		subtitle: Heading written at the top of the record.
	"""
	path = 'tpo_listening2'
	# exist_ok avoids the check-then-create race (FileExistsError) when
	# several worker threads reach this point at the same time.
	os.makedirs(path, exist_ok=True)

	# One text block per question: the question, then its options on one line.
	blocks = [entry[0] + '\n' + ', '.join(entry[1]) for entry in set_que_plus_opt]
	li_str = '\n'.join(blocks)
	# Put a line break before each question number and option letter.
	for marker in ('1.', '2.', '3.', '4.', 'A. ', 'B. ', 'C. ', 'D. ', '5.', '6.'):
		li_str = li_str.replace(marker, '\n' + marker)

	gap = '\n' * 50
	record = subtitle + '\n\n' + article + gap + li_str + gap
	# Explicit UTF-8 so scraped non-ASCII text does not depend on the
	# platform's default encoding. The record is built first and written
	# in a single call instead of five small writes.
	with open(f"{path}/{title}.txt", 'a+', encoding='utf-8') as f:
		f.write(record)

最后定义主函数。开启多线程，BoundedSemaphore限制并发数。全部用时92.3s。

if __name__ == '__main__':
	# Script entry point: gather every set URL first, then scrape the sets
	# with one worker thread each.
	urls = get_page()

	total_lis_set_urls = []
	for url in urls:
		# Set (big-question) URLs found on this listing page.
		listening_set_urls = get_tpo_url(url)
		# Guard against a None result so one failed listing page does not
		# abort the whole run inside extend().
		total_lis_set_urls.extend(listening_set_urls or [])

	# At most 20 workers hold a slot at once. A slot is taken here before
	# each thread starts and is given back by the worker (see parse_page).
	semaphore = threading.BoundedSemaphore(20)
	record_threads = []
	for i in total_lis_set_urls:
		semaphore.acquire()
		t1 = threading.Thread(target=get_question_page,args=(i,))
		t1.start()
		record_threads.append(t1)

	# Wait for every worker so anything placed after this point only runs
	# once all sets have been processed.
	for worker in record_threads:
		worker.join()

成果展示。

fangqileo

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
【Python多线程】听力

导入模块import requestsimport randomfrom pyquery import PyQuery as pqimport reimport threadingimport os设置ua，这里不赘述。这里每一个选项都是一个列表页，首先我们获取所有列表页的url。这里用pyquery找到所有a节点后，items（）生成器，之后用列表解析直接返回一个列表。def get_page(): url = 'https://top.zhan.com/toef.
复制链接

扫一扫