I heard that a certain agency's website has weak anti-scraping defenses, so, tempting fate, I crawled the document listings it recently published, both to study a relevant case and as a cautionary exercise for myself.
First, open the page and inspect its HTML source to work out where the information we want is stored.
After inspecting it, I found the elements that hold each document's list entry. Then I located the pagination link and decided to crawl the first six pages for practice (a sketch of the assumed markup follows).
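For reference, here is a minimal sketch of the page structure the scraper assumes. Only the class names (list_tit, ah, date, next) come from the code below; the surrounding tags and sample values are made up for illustration:

from bs4 import BeautifulSoup

# hypothetical fragment mimicking the real page; only the class names are real
sample = """
<ul>
  <li class="list_tit"><a href="/wenshu/1.html">Some document title</a></li>
</ul>
<div class="ah">(2019) No. 123</div>
<div class="date">2019-01-15</div>
<li class="next"><a href="/wenshu_2.html">Next</a></li>
"""
soup = BeautifulSoup(sample, 'lxml')
print(soup.find('li', attrs={'class': 'list_tit'}).get_text(strip=True))  # Some document title
print(soup.find('li', attrs={'class': 'next'}).find('a')['href'])         # /wenshu_2.html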
With those two locations identified, we can write the code. The full script follows:
from bs4 import BeautifulSoup
import requests
import os
import time
url = "http://www.*****.gov.cn/wenshu.html"  # after some thought I'd still better not share this URL; knowing it's a .gov site is enough
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
def get_Content(html, num):
    # the three element classes identified during the HTML inspection above
    htmldata1 = html.find_all(name='li', attrs={'class': 'list_tit'})  # document title
    htmldata2 = html.find_all(name='div', attrs={'class': 'ah'})       # the 'ah' field (likely the case number)
    htmldata3 = html.find_all(name='div', attrs={'class': 'date'})     # publication date
    data_One = [i.get_text() for i in htmldata1]
    data_Two = [i.get_text() for i in htmldata2]
    data_Three = [i.get_text() for i in htmldata3]
    # stitch the three fields of each row back into one line per document
    data_End = [one + " " + two + " " + three + '\n'
                for one, two, three in zip(data_One, data_Two, data_Three)]
    os.makedirs('d:/Paper', exist_ok=True)  # make sure the output folder exists
    with open("d:/Paper/Page" + str(num) + ".txt", 'w', encoding='utf-8') as fp:
        fp.writelines(data_End)
def get_Page(html):
    # the "next page" link sits inside <li class="next"><a href="...">
    pos = html.find(name='li', attrs={'class': 'next'}).find('a')
    return pos['href']

# fetch and save the first page
response = requests.get(url, headers=headers)
html = BeautifulSoup(response.content, 'lxml')
get_Content(html, 0)
# crawl the next five pages by following each page's "next" link
for i in range(1, 6):
    link = 'http://www.*****.gov.cn' + get_Page(html)  # the href is relative, so prepend the site root
    time.sleep(1)  # be polite and pause between requests
    response = requests.get(link, headers=headers)
    html = BeautifulSoup(response.content, 'lxml')
    get_Content(html, i)
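One caveat worth noting: if the last page omits the "next" link, find() returns None and get_Page blows up with an AttributeError. A more defensive variant might look like this (a sketch only; I have not checked how the real site renders its last page):

def get_Page_safe(html):
    # returns the next-page href, or None when there is no "next" link
    pos = html.find(name='li', attrs={'class': 'next'})
    if pos is None:
        return None  # assumption: the last page simply has no <li class="next">
    a = pos.find('a')
    return a['href'] if a is not None and a.has_attr('href') else None

The main loop would then break as soon as get_Page_safe returns None instead of crashing; it is also worth calling response.raise_for_status() after each requests.get so a blocked or failed request is noticed immediately.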