Website used:
Civil service exam materials
http://www.chinagwy.org/html/stzx/gj/14_1.html
The URL is shown above. Experience told me that the trailing 1 is a page number, so I tried it and found that the second page is
http://www.chinagwy.org/html/stzx/gj/14_2.html
Sure enough, it is. With that confirmed, I moved on to parsing the pages.
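If you want to check the pattern yourself, here is a minimal sketch; the page range and the stop-on-non-200 rule are my own assumptions for illustration, not something the site documents:

import requests

# Probe successive list pages: the trailing number in the URL is the page index.
# Stopping at the first non-200 response is an assumption, not a site guarantee.
for page in range(1, 6):
    url = 'http://www.chinagwy.org/html/stzx/gj/14_%d.html' % page
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        break
    print(url, 'exists')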
I use the requests library to fetch the pages and BeautifulSoup to parse them. I won't walk through the page analysis itself; open the site, look at the page source, and it will be clear. If anything is unclear, feel free to ask me.
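To give a taste of what the parsing looks like, here is a minimal sketch that pulls the per-paper links off one list page; the ul[class=list01] li a selector is the same one used in the full script below, and the page structure is assumed to be unchanged:

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://www.chinagwy.org/html/stzx/gj/14_1.html')
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
# Each past-paper entry is an <a> inside <ul class="list01"><li>...</li></ul>.
for a in soup.select('ul[class=list01] li a'):
    href = a.get('href')
    if 'index' not in href:          # skip the pagination/index links
        print(href, a.get_text(strip=True))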
The full source code is below.
# 获取往年真题卷子.py -- download past years' exam papers
'''
Site used: http://www.chinagwy.org/html/stzx/gj/14_1.html
@author: 海hong啊
'''
import requests
from bs4 import BeautifulSoup
import random, time, re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3',
    'Upgrade-Insecure-Requests': '1'
}
ip_list = []
def download_pdf(pdf_herf, title):
    # Take the file suffix (.pdf, .doc, ...) from the download link itself
    # rather than assuming every paper is a PDF.
    pdf_format = pdf_herf[pdf_herf.rfind('.'):]
    filename = title + pdf_format
    if '/' in filename:
        print('not a valid filename')
    else:
        print(filename)
        requests_pdf = requests.get(pdf_herf, headers=headers)
        with open(filename, 'wb') as F:
            F.write(requests_pdf.content)
        print('done')
def get_href(url, proxies):
    # Collect the links to the individual exam-paper pages from one list page.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        content = BeautifulSoup(response.text, 'lxml')
        hrefs = content.select('ul[class=list01] li a')
        for a in hrefs:
            href = a.get('href')
            # Skip index/pagination links such as /html/stzx/gj/index.html.
            if 'index' in href:
                continue
            url_IDs.append(href)
    return url_IDs
def get_info(url_ID, proxies):
    response = requests.get(url_ID, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        content = BeautifulSoup(response.text, 'lxml')
        # The site moves the download link around in the page markup from time
        # to time, so several possible locations are tried one after another
        # (see the note below the code).
        try:
            pdf_herf = content.select('div[class=c_l_c_06_5] p a')[1]
            pdf_herf = pdf_herf.get('href')
        except Exception:
            try:
                pdf_herf = content.select('div[class=c_l_c_06_5] p a')[0]
                pdf_herf = pdf_herf.get('href')
            except Exception:
                try:
                    pdf_herf = content.select('div[class=c_l_c_06_5] div a')
                    pdf_herf = pdf_herf[2].get('href')
                except Exception:
                    try:
                        pdf_herf = content.select('div[class=c_l_c_06_5] div a')
                        pdf_herf = pdf_herf[1].get('href')
                    except Exception:
                        try:
                            pdf_herf = content.select('div[class=c_l_c_06_5] span span a')
                            pdf_herf = pdf_herf[0].get('href')
                        except Exception:
                            pdf_herf = content.select('div[class=c_l_c_06_5] a')
                            pdf_herf = pdf_herf[0].get('href')
        # The <h1> title contains the year, e.g. '2019年…'; keep everything from
        # the four digits before '年' onwards and strip the trailing '</h1>'.
        title = str(content.select('h1')[0])
        title = title[title.rfind('年') - 4:]
        title = title[0: title.rfind('</h1>')]
        print(pdf_herf, '\n', title)
        download_pdf(pdf_herf, title)
        print('__________________________________________')
def main(url):
    ip_list = [
        {'http': 'http://202.43.183.210:8080'},
        {'http': 'http://1.179.144.182:80'},
        {'http': 'http://182.253.181.10:8080'},
        {'http': 'http://117.239.54.92:3128'},
        {'http': 'http://94.140.208.226:8080'},
        {'http': 'http://200.116.176.77:8080'},
        {'http': 'http://1.0.184.168:8080'},
        {'http': 'http://134.35.174.4:8080'},
    ]
    # Walk the first few list pages and collect the exam-paper links.
    for i in range(1, 6):
        url = 'http://www.chinagwy.org/html/stzx/gj/14_%s.html' % str(i)
        # A proxy is picked at random here, but get_href()/get_info() do not
        # actually pass it to requests; add proxies=proxies to requests.get()
        # if you want to use it.
        proxies = random.choice(ip_list)
        get_href(url, proxies)
        time.sleep(5)
    # 真题_url_id.txt records which papers were already downloaded, so reruns
    # can skip them; opening with 'a+' also creates the file on the first run.
    with open('真题_url_id.txt', 'a+', encoding='utf-8') as f:
        f.seek(0)
        f_id = f.read()
    print(f_id)
    for ID in url_IDs:
        if ID in f_id:
            continue  # already downloaded on a previous run
        print(ID)
        proxies = random.choice(ip_list)
        get_info(ID, proxies)
        with open('真题_url_id.txt', 'a+', encoding='utf-8') as f:
            f.write(ID)
        time.sleep(10)
if __name__ == '__main__':
    url_IDs = []
    REdate_IDs = []
    url = 'http://www.chinagwy.org/html/stzx/gj/14_%s.html' % str(1)
    main(url)
    url_IDs.clear()
    REdate_IDs.clear()
You can see that I use a lot of try blocks, and nested ones at that. After crawling the site many times I discovered, again and again, that every time the site is maintained the download link ends up in a different place in the page markup, so several locations have to be tried. This was the most painful part of the whole thing; I spent a long time on it and went through many attempts before it worked.

Another issue: some of the downloads are Word documents rather than PDFs. At first I simply hard-coded .pdf as the file suffix, then noticed that a few of the files would not open. I guessed it was a format problem, went back to the page in question, and saw that the download was a .doc rather than a .pdf. So I changed how the code obtains the suffix: instead of blindly appending .pdf, it now takes the suffix from the download link itself.
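If the nested try blocks bother you, one possible cleanup is sketched below. The selector list and its priority order are copied from get_info() above; find_download_href() and file_extension() are hypothetical helpers of mine, not part of the original script:

import re

# Candidate (selector, index) pairs, in the same priority order as the nested
# try blocks in get_info() above.
CANDIDATES = [
    ('div[class=c_l_c_06_5] p a', 1),
    ('div[class=c_l_c_06_5] p a', 0),
    ('div[class=c_l_c_06_5] div a', 2),
    ('div[class=c_l_c_06_5] div a', 1),
    ('div[class=c_l_c_06_5] span span a', 0),
    ('div[class=c_l_c_06_5] a', 0),
]

def find_download_href(content):
    # Return the href of the first candidate location that exists on the page.
    for selector, index in CANDIDATES:
        links = content.select(selector)
        if len(links) > index:
            return links[index].get('href')
    return None

def file_extension(href):
    # Hypothetical helper: pull the suffix (.pdf, .doc, ...) off the end of the
    # download link instead of assuming every paper is a PDF.
    m = re.search(r'\.[A-Za-z0-9]+$', href)
    return m.group(0) if m else '.pdf'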
I hope this is useful. If you have any questions, feel free to leave a comment and ask.