Website used:
Civil service exam materials
http://www.chinagwy.org/html/stzx/gj/14_1.html
The URL is shown above. Experience told me that the trailing 1 is a page number, so I tried it and found that the second page is
http://www.chinagwy.org/html/stzx/gj/14_2.html
Sure enough, it is. With that confirmed, I moved on to parsing the pages.
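If you want to check the pattern yourself, here is a minimal sketch; the page range and the stop-on-non-200 rule are my own assumptions for illustration, not something the site documents:

import requests

# Probe successive list pages: the trailing number in the URL is the page index.
# Stopping at the first non-200 response is an assumption, not a site guarantee.
for page in range(1, 6):
    url = 'http://www.chinagwy.org/html/stzx/gj/14_%d.html' % page
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        break
    print(url, 'exists')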
I use the requests library to fetch the pages and BeautifulSoup to parse them. I won't walk through the page analysis itself; open the site, look at the page source, and it will be clear. If anything is unclear, feel free to ask me.
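To give a taste of what the parsing looks like, here is a minimal sketch that pulls the per-paper links off one list page; the ul[class=list01] li a selector is the same one used in the full script below, and the page structure is assumed to be unchanged:

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://www.chinagwy.org/html/stzx/gj/14_1.html')
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
# Each past-paper entry is an <a> inside <ul class="list01"><li>...</li></ul>.
for a in soup.select('ul[class=list01] li a'):
    href = a.get('href')
    if 'index' not in href:          # skip the pagination/index links
        print(href, a.get_text(strip=True))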
The full source code is below.
# 获取往年真题卷子.py -- download past years' exam papers
'''
Site used: http://www.chinagwy.org/html/stzx/gj/14_1.html
@author: 海hong啊
'''
import requests
from bs4 import BeautifulSoup
import random, time, re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3',
    'Upgrade-Insecure-Requests': '1'
}
ip_list = []
def download_pdf(pdf_herf, title):
    # Take the file suffix (.pdf, .doc, ...) from the download link itself
    # rather than assuming every paper is a PDF.
    pdf_format = pdf_herf[pdf_herf.rfind('.'):]
    filename = title + pdf_format
    if '/' in filename:
        print('not a valid filename')
    else:
        print(filename)
        requests_pdf = requests.get(pdf_herf, headers=headers)
        with open(filename, 'wb') as F:
            F.write(requests_pdf.content)
        print('done')
def get_href(url, proxies):
    # Collect the links to the individual exam-paper pages from one list page.
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        content = BeautifulSoup(response.text, 'lxml')
        hrefs = content.select('ul[class=list01] li a')
        for a in hrefs:
            href = a.get('href')
            # Skip index/pagination links such as /html/stzx/gj/index.html.
            if 'index' in href:
                continue
            url_IDs.append(href)
    return url_IDs
def get_info(url_ID, proxies):
    response = requests.get(url_ID, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        content = BeautifulSoup(response.text, 'lxml')
        # The site moves the download link around in the page markup from time
        # to time, so several possible locations are tried one after another
        # (see the note below the code).
        try:
            pdf_herf = content.select('div[class=c_l_c_06_5] p a')[1]
            pdf_herf = pdf_herf.get('href')
        except Exception:
            try:
                pdf_herf = content.select('div[class=c_l_c_06_5] p a')[0]
                pdf_herf = pdf_herf.get('href')
            except Exception:
                try:
                    pdf_herf = content.select('div[class=c_l_c_06_5] div a')
                    pdf_herf = pdf_herf[2].get('href')
                except Exception:
                    try:
                        pdf_herf = content.select('div[class=c_l_c_06_5] div a')
                        pdf_herf = pdf_herf[1].get('href')
                    except Exception:
                        try:
                            pdf_herf = content.select('div[class=c_l_c_06_5] span span a')
                            pdf_herf = pdf_herf[0].get('href')
                        except Exception:
                            pdf_herf = content.select('div[class=c_l_c_06_5] a')
                            pdf_herf = pdf_herf[0].get('href')
        # The <h1> title contains the year, e.g. '2019年…'; keep everything from
        # the four digits before '年' onwards and strip the trailing '</h1>'.
        title = str(content.select('h1')[0])
        title = title[title.rfind('年') - 4:]
        title = title[0: title.rfind('</h1>')]
        print(pdf_herf, '\n', title)
        download_pdf(pdf_herf, title)
        print('__________________________________________')
def main(url):
    ip_list = [
        {'http': 'http://202.43.183.210:8080'},
        {'http': 'http://1.179.144.182:80'},
        {'http': 'http://182.253.181.10:8080'},
        {'http': 'http://117.239.54.92:3128'},
        {'http': 'http://94.140.208.226:8080'},
        {'http': 'http://200.116.176.77:8080'},
        {'http': 'http://1.0.184.168:8080'},
        {'http': 'http://134.35.174.4:8080'},
    ]
    # Walk the first few list pages and collect the exam-paper links.
    for i in range(1, 6):
        url = 'http://www.chinagwy.org/html/stzx/gj/14_%s.html' % str(i)
        # A proxy is picked at random here, but get_href()/get_info() do not
        # actually pass it to requests; add proxies=proxies to requests.get()
        # if you want to use it.
        proxies = random.choice(ip_list)
        get_href(url, proxies)
        time.sleep(5)
    # 真题_url_id.txt records which papers were already downloaded, so reruns
    # can skip them; opening with 'a+' also creates the file on the first run.
    with open('真题_url_id.txt', 'a+', encoding='utf-8') as f:
        f.seek(0)
        f_id = f.read()
    print(f_id)
    for ID in url_IDs:
        if ID in f_id:
            continue  # already downloaded on a previous run
        print(ID)
        proxies = random.choice(ip_list)
        get_info(ID, proxies)
        with open('真题_url_id.txt', 'a+', encoding='utf-8') as f:
            f.write(ID)
        time.sleep(10)
if __name__ == '__main__':
    url_IDs = []
    REdate_IDs = []
    url = 'http://www.chinagwy.org/html/stzx/gj/14_%s.html' % str(1)
    main(url)
    url_IDs.clear()
    REdate_IDs.clear()
You can see that I use a lot of try blocks, and nested ones at that. After crawling the site many times I discovered, again and again, that every time the site is maintained the download link ends up in a different place in the page markup, so several locations have to be tried. This was the most painful part of the whole thing; I spent a long time on it and went through many attempts before it worked.

Another issue: some of the downloads are Word documents rather than PDFs. At first I simply hard-coded .pdf as the file suffix, then noticed that a few of the files would not open. I guessed it was a format problem, went back to the page in question, and saw that the download was a .doc rather than a .pdf. So I changed how the code obtains the suffix: instead of blindly appending .pdf, it now takes the suffix from the download link itself.
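If the nested try blocks bother you, one possible cleanup is sketched below. The selector list and its priority order are copied from get_info() above; find_download_href() and file_extension() are hypothetical helpers of mine, not part of the original script:

import re

# Candidate (selector, index) pairs, in the same priority order as the nested
# try blocks in get_info() above.
CANDIDATES = [
    ('div[class=c_l_c_06_5] p a', 1),
    ('div[class=c_l_c_06_5] p a', 0),
    ('div[class=c_l_c_06_5] div a', 2),
    ('div[class=c_l_c_06_5] div a', 1),
    ('div[class=c_l_c_06_5] span span a', 0),
    ('div[class=c_l_c_06_5] a', 0),
]

def find_download_href(content):
    # Return the href of the first candidate location that exists on the page.
    for selector, index in CANDIDATES:
        links = content.select(selector)
        if len(links) > index:
            return links[index].get('href')
    return None

def file_extension(href):
    # Hypothetical helper: pull the suffix (.pdf, .doc, ...) off the end of the
    # download link instead of assuming every paper is a PDF.
    m = re.search(r'\.[A-Za-z0-9]+$', href)
    return m.group(0) if m else '.pdf'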
I hope this is useful. If you have any questions, feel free to leave a comment and ask.