- Today my boss handed me an urgent last-minute task: scrape data from a website! The whole task came down to one keyword: tight deadline.
- I got started right away, but this job differs from scraping ordinary PDFs: the page behind each PDF link has no download button, so my usual download template just returned a pile of page source. Checking that source showed the page's download feature had been blocked.
- Enough rambling; on to the main content.
Requirements:
Figure 1: from the company list page shown below, for every company in the list, scrape all PDF documents found after clicking through; create a folder named after the company and store all of that company's PDFs in it;
Figure 2: the PDF links on one company's news page
Figure 3: the page behind one of those PDF links
Modules involved:
- Parsing the company list page and extracting its information
- Collecting each company's news-page PDF links, link names, and dates; saving them to a JSON file and converting the JSON to CSV
- Downloading the PDF behind every link on the news page
Scraping results:
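For each company, the result is a folder named after the company containing every PDF from its news page, plus a <company name>.json file (one JSON object per line, mapping link name to date) and a matching <company name>.csv.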
Steps:
The previous post covered a similar, dynamically loaded site, so I won't repeat the parts the two have in common.
1 Analyzing how the list page loads dynamically
This list page is the company list page from Figure 1. It's broadly the same as the earlier site, with a few quirks:
- On the dynamically loaded sites scraped before, the response was readable in DevTools under Network → XHR → Preview; on this site the Preview is garbled. Fortunately the Response tab still shows it.
- The POST request returns an ordinary HTML page (shown below), not the ready-to-use JSON of the previous post, so the page has to be parsed with etree and the data located with XPath.
- The request method is POST and the headers are unremarkable; the ones that must be set are Cookie, Referer, User-Agent, Host, and Origin. The data payload is empty, so no parameters are passed.
- Each pair of li tags is one company: the first holds the Chinese name, the second the English one. The id attribute on the a tag is crucial, because it forms part of the URL of that company's news page, so it has to be extracted. A condensed excerpt for this step follows; the full script is under "Code implementation" in step 3.
import requests
from lxml import etree

ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
cookie1 = "JSESSIONID=3AA2EA0F3F36735DC7A381EE17C128E5; acw_tc=7b39758215549521874368504e99d8aa79e30b17cb54f80948c4090d5915cc; JSESSIONID=039F00EAA41E1B7BC48B31A3C5327ED6; _pk_ses.6.1152=1; _pk_id.6.1152=5be02b21272f0f1f.1554952187.1.1554953567.1554952187.; SERVERID=e058fcfed3b7b21b1212624284f4b183|1554953569|1554952187"
host1 = "icid.iachina.cn"
Referer = "http://icid.iachina.cn/ICID/"
Origin = "http://icid.iachina.cn"
headers1 = {'User-Agent': ua, 'Cookie': cookie1, 'Host': host1, 'Referer': Referer, 'Origin': Origin}

# POST with an empty body; the response is an HTML page, not JSON
url = "http://icid.iachina.cn/ICID/front/leafColComType.do?columnid=201510010001"
req = requests.post(url, headers=headers1)
page = etree.HTML(req.text)

# every company occupies two li tags (Chinese name first, English name second);
# the id attribute on each a tag is later used to build the company's news-page URL
cmp_name = page.xpath("//div[@class='jie_nei']//li[@type_code='03']")
cmp_sx = page.xpath("//div[@class='jie_nei']//li[@type_code='03']/a")
for each in cmp_sx:
    print(each.xpath("./@id"))
2 Collecting each company's news-page PDF links, link names, and dates; saving JSON and converting it to CSV
Nothing tricky here: iterate over the company URLs from the list page and pull the information off each news page. The approach is basically the same as before, with small differences (a sketch follows the list):
- The headers stay unchanged and data remains empty
- Each URL is assembled from the id obtained in the previous step
- Saving the JSON file and converting it to CSV reuses the same template as before
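A minimal sketch of this step, continuing from the listing-page snippet in step 1 (headers1 as defined there; comCode stands for one a-tag id scraped from the list page):
page_url = ("http://icid.iachina.cn/ICID/front/getCompanyInfos.do?columnid=201510010001&comCode="
            + comCode + "&attr=01")
resp = requests.post(page_url, headers=headers1)  # data is still empty
news = etree.HTML(resp.text)
list_name = news.xpath("//div[@class='jie_nei']//a/text()")               # link names
list_date = news.xpath("//div[@class='jie_nei']//p[@class='kk']/text()")  # dates
pdf_info = dict(zip(list_name, list_date))  # name -> date, ready for json.dumps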
3 Downloading the PDF behind each link on the news page
The document page to scrape is the Figure 3 page from the requirements; all we need is that PDF. The catch, unlike before, is that the link carries a .pdf suffix but what actually loads is a preview rendered by the pdf.js framework, as shown below:
The request endpoint for that page, i.e. the request URL, is shown below. The PDF page itself has no download button at all, and the endpoint URL assembled the old way,
http://icid.iachina.cn/ICID/front/infoDetail.do?informationno=2018042516095060
only returns the preview page, while the file's real address is
"http://icid.iachina.cn/ICID/files/piluxinxi/pdf/" + pdf[0].xpath("./@id")[0]
This post explains the trick very clearly:
https://blog.csdn.net/For_GG/article/details/78616063
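In code, the jump from the detail page to the real file looks like this (a condensed excerpt of the full script below; the informationno value is the example from above):
detail_url = "http://icid.iachina.cn/ICID/front/infoDetail.do?informationno=2018042516095060"
req_pdf = requests.post(detail_url, headers=headers1)
nn = etree.HTML(req_pdf.text)
pdf = nn.xpath("//div[@class='pdf_a']//a")  # the a tag whose id is the PDF's file name
download = "http://icid.iachina.cn/ICID/files/piluxinxi/pdf/" + pdf[0].xpath("./@id")[0]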
Code implementation:
import requests
from lxml import etree
import os
import json
import pandas as pd

def save_pdf(pdf_dir, pdf_name, url):
    # download one PDF into pdf_dir, skipping files that already exist
    if not os.path.exists(pdf_dir):
        os.makedirs(pdf_dir)
    try:
        resp = requests.get(url)
        if resp.status_code == 200:
            file_path = pdf_dir + os.path.sep + pdf_name + '.pdf'
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded pdf path is %s' % file_path)
            else:
                print('Already downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to save pdf, item %s' % pdf_name)

ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
cookie1 = "JSESSIONID=3AA2EA0F3F36735DC7A381EE17C128E5; acw_tc=7b39758215549521874368504e99d8aa79e30b17cb54f80948c4090d5915cc; JSESSIONID=039F00EAA41E1B7BC48B31A3C5327ED6; _pk_ses.6.1152=1; _pk_id.6.1152=5be02b21272f0f1f.1554952187.1.1554953567.1554952187.; SERVERID=e058fcfed3b7b21b1212624284f4b183|1554953569|1554952187"
host1 = "icid.iachina.cn"
Referer = "http://icid.iachina.cn/ICID/"
Origin = "http://icid.iachina.cn"
headers1 = {'User-Agent': ua, 'Cookie': cookie1, 'Host': host1, 'Referer': Referer, 'Origin': Origin}
save_root = 'E:\\work\\ZZX\\res2'

# step 1: fetch the company list page (POST, empty body, HTML response)
url = "http://icid.iachina.cn/ICID/front/leafColComType.do?columnid=201510010001"
req = requests.post(url, headers=headers1)
page = etree.HTML(req.text)
# every company occupies two li tags: Chinese name, then English name
cmp_name = page.xpath("//div[@class='jie_nei']//li[@type_code='03']")
cmp_sx = page.xpath("//div[@class='jie_nei']//li[@type_code='03']/a")

for i in range(0, len(cmp_name), 2):  # step by 2 to skip the English-name li
    company_name = cmp_name[i][0].text
    # step 2: the company's news page, addressed by the id scraped above
    page_url = ("http://icid.iachina.cn/ICID/front/getCompanyInfos.do?columnid=201510010001&comCode="
                + cmp_sx[i].xpath('./@id')[0] + "&attr=01")
    req_list = requests.post(page_url, headers=headers1)
    mm = etree.HTML(req_list.text)
    list_name = mm.xpath("//div[@class='jie_nei']//a/text()")   # link names
    list_id = mm.xpath("//div[@class='jie_nei']//a")            # a tags carrying the ids
    list_date = mm.xpath("//div[@class='jie_nei']//p[@class='kk']/text()")  # dates
    file_path = save_root + '\\' + company_name
    pdf_info = {}
    for j in range(len(list_name)):
        pdf_info[list_name[j]] = list_date[j]
        # step 3: open the detail page, read the id of the a tag inside div.pdf_a,
        # and assemble the real file URL from it
        url2 = "http://icid.iachina.cn/ICID/front/infoDetail.do?informationno=" + list_id[j].xpath("./@id")[0]
        req_pdf = requests.post(url2, headers=headers1)
        nn = etree.HTML(req_pdf.text)
        pdf = nn.xpath("//div[@class='pdf_a']//a")
        download = "http://icid.iachina.cn/ICID/files/piluxinxi/pdf/" + pdf[0].xpath("./@id")[0]
        save_pdf(file_path, list_name[j], download)
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    json_name = file_path + '\\' + company_name + '.json'
    csv_name = file_path + '\\' + company_name + '.csv'
    print(file_path)
    # save the name -> date mapping, one JSON object per line
    # ('a+' appends, so rerunning the script adds duplicate lines)
    try:
        with open(json_name, 'a+', encoding="utf-8") as fp:
            fp.write(json.dumps(pdf_info, ensure_ascii=False) + "\n")
    except IOError as err:
        print('error' + str(err))
    # convert the JSON lines to CSV
    inp = []
    with open(json_name, "r", encoding="utf-8") as f:
        for line in f:
            inp.append(json.loads(line))  # each line holds one dict
    print(inp)
    pd.DataFrame(inp).to_csv(csv_name)
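To sanity-check one company's output, the generated CSV can be read back with pandas (the path and company name here are illustrative):
import pandas as pd
df = pd.read_csv('E:\\work\\ZZX\\res2\\SomeCompany\\SomeCompany.csv', index_col=0)
print(df.head())  # one row per scrape run, one column per PDF name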