工具
PyCharm
目的
'''
网址:https://ljgk.envsc.cn/
需求:获取到地址(address),公司名字(ps_name),创建的时间(create_time),将数据分别保存在json文件和csv表格
'''
代码
# 动态数据,刷新网页后,有一个包含所有数据的XHR
import requests
import json
import csv
class Spider():
    """Fetch a JSON record list over HTTP and export selected fields.

    The workflow is ``get_data`` -> ``parse_data`` -> ``save_data``; ``run``
    chains the three. Parsed records accumulate in ``self.list`` (the
    attribute name is kept as-is so existing callers keep working).
    """

    # Keys copied from every record; also the CSV column order.
    FIELDNAMES = ('ps_name', 'address', 'create_time')

    def __init__(self, url, headers, file_name, data=None, json=None, text_type=0, timeout=10):
        """Store the request settings.

        Args:
            url: Address requested by ``get_data``.
            headers: HTTP headers sent with the request.
            file_name: Output file name without extension; ``save_data``
                appends ``.json`` and ``.csv``.
            data: Optional request body forwarded to ``requests.get``.
            json: Optional JSON body forwarded to ``requests.get``.
            text_type: Form of the value returned by ``get_data``:
                0 = decoded text, 1 = raw bytes, 2 = parsed JSON.
            timeout: Seconds to wait for the server before giving up.
        """
        self.list = []
        self.url = url
        self.headers = headers
        self.data = data
        self.json = json
        self.text_type = text_type
        self.file_name = file_name
        self.timeout = timeout

    def get_data(self):
        """Request ``self.url`` and return the body in the configured form.

        Returns:
            ``response.text``, ``response.content`` or ``response.json()``
            for ``text_type`` 0, 1 and 2 respectively.

        Raises:
            ValueError: ``text_type`` is not 0, 1 or 2.
            requests.HTTPError: the server answered with an error status.
        """
        # Reject a bad configuration before spending a network round trip;
        # previously this case silently returned None.
        if self.text_type not in (0, 1, 2):
            raise ValueError(
                'text_type must be 0 (text), 1 (content) or 2 (json), got {!r}'.format(self.text_type)
            )
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(
            self.url,
            headers=self.headers,
            data=self.data,
            json=self.json,
            timeout=self.timeout,
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        if self.text_type == 0:
            return response.text
        if self.text_type == 1:
            return response.content
        return response.json()

    def parse_data(self, text):
        """Append the fields named in ``FIELDNAMES`` of each record to ``self.list``.

        Args:
            text: An iterable of dict records, or the same data still encoded
                as a JSON string/bytes (what ``get_data`` returns for
                ``text_type`` 0 or 1).

        Raises:
            KeyError: a record lacks one of the ``FIELDNAMES`` keys.
        """
        # Raw payloads are decoded first so every text_type can be parsed.
        if isinstance(text, (str, bytes, bytearray)):
            text = json.loads(text)
        for record in text:
            self.list.append({field: record[field] for field in self.FIELDNAMES})

    def save_data(self):
        """Write ``self.list`` to ``<file_name>.json`` and ``<file_name>.csv``.

        Both files are overwritten so they always describe the same data;
        appending to the CSV used to add a second header row (and duplicate
        rows) on every run.
        """
        with open('{}.json'.format(self.file_name), 'w', encoding='utf8') as f:
            json.dump(self.list, f, ensure_ascii=False, indent=2)
        with open('{}.csv'.format(self.file_name), 'w', encoding='utf8', newline='') as f:
            csv_writer = csv.DictWriter(f, fieldnames=self.FIELDNAMES)
            csv_writer.writeheader()
            csv_writer.writerows(self.list)

    def run(self):
        """Entry point: fetch, parse, then save."""
        text = self.get_data()
        self.parse_data(text)
        self.save_data()
if __name__ == '__main__':
    # Endpoint that returns the full record list as JSON.
    # NOTE(review): sgn/ts/tc look like a request signature and timestamps
    # captured from a browser session — presumably they expire; verify.
    url = (
        'https://ljgk.envsc.cn/OutInterface/GetPSList.ashx'
        '?regionCode=0&psname='
        '&SystemType=C16A882D480E678F'
        '&sgn=2c887fad3076724ffd70d22320308a5d7b501610'
        '&ts=1691844481490'
        '&tc=11515962'
    )
    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }
    # Output base name: produces companies.json and companies.csv.
    file_name = 'companies'
    # text_type=2 asks Spider.get_data for the parsed JSON body.
    spider = Spider(url=url, headers=headers, file_name=file_name, text_type=2)
    spider.run()
运行结果
见资源