Tags: python, xpath, requests, json, web
When the response is JSON data
import requests
import json

res = {
    'data': []
}
header_data = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}
url_id = []
for i in range(1, 3):
    url_id.append(str(i))
for id in url_id:
    url = r'http://infogate.jl.gov.cn/govsearch/jsonp/zf_jd_list.jsp?page={id}&lb=134657&callback=result&sword=&searchColumn=all&searchYear=all&pubURL=http%3A%2F%2Fxxgk.jl.gov.cn%2F&SType=1&searchYear=all&pubURL=&SType=1&channelId=134657&_=1651314237358'.format(id=id)
    resp = requests.get(url, headers=header_data)
    resp.encoding = 'utf-8'
    text_content = resp.text
    # Strip the JSONP wrapper: drop the callback prefix (55 characters
    # for this endpoint) and the trailing ")".
    text_content = text_content[55:len(text_content) - 1]
    with open("./data/page.json", 'w', encoding='utf-8') as f:
        print(text_content)
        f.write(text_content)
    all_info = json.loads(text_content)['data']
    for i, x in enumerate(all_info):
        cur = {
            'title': x['tip']['title'],
            'time': x['tip']['dates']
        }
        res['data'].append(cur)
        print(cur)
        if i % 50 == 0:  # checkpoint periodically so a crash loses little
            with open('./data/ind.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(res, ensure_ascii=False))
with open('./data/ind.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(res, ensure_ascii=False))
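The slice [55:len(text_content)-1] is tied to the exact length of this endpoint's callback wrapper. A more robust way to unwrap any callback(...) payload is a regular expression; a minimal sketch (strip_jsonp is a name introduced here for illustration):

import re

def strip_jsonp(payload):
    # Capture the JSON body between the outermost parentheses of
    # "callback( ... )"; returns None if the payload does not match.
    m = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', payload, re.S)
    return m.group(1) if m else None

# strip_jsonp('result({"data": []})') -> '{"data": []}'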
When the response is HTML source
import requests
from lxml import etree
import json

res = {
    'data': []
}
# Resume from the results accumulated by the previous crawler.
with open('./data/ind.json', 'r', encoding='utf-8') as f:
    s = f.read()
    res = json.loads(s)
header_data = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}
# The first page has no suffix; later pages are index_7.html ... index_21.html.
url_id = ['']
for i in range(7, 22):
    url_id.append('_' + str(i))
for id in url_id:
    url = 'http://www.xizang.gov.cn/zwgk/xxfb/zbwj/index{id}.html'.format(id=id)
    resp = requests.get(url, headers=header_data)
    resp.encoding = 'utf-8'
    text_content = resp.text
    with open("./data/page.html", 'w', encoding='utf-8') as f:
        print(text_content)
        f.write(text_content)
    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.HTML(text_content, parser=parser)
    title_list_pos = '/html/body/div[2]/div/div[2]/div[2]/ul/li/a/text()'
    time_list_pos = '/html/body/div[2]/div/div[2]/div[2]/ul/li/span/text()'
    time_list = tree.xpath(time_list_pos)
    title_list = tree.xpath(title_list_pos)
    for i, x in enumerate(time_list):
        cur = {
            'title': title_list[i],
            'time': x.strip()
        }
        res['data'].append(cur)
with open('./data/ind.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(res, ensure_ascii=False))
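Indexing title_list[i] assumes both XPath queries return lists of the same length. As a sketch of a slightly safer variant, zip pairs the two lists and simply stops at the shorter one instead of raising IndexError:

for title, t in zip(title_list, time_list):
    res['data'].append({'title': title, 'time': t.strip()})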
Crawling directly with a simulated browser
The browser opens a page, scrapes its content, clicks through to the next page, and sleeps 3 seconds between pages.
import json
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

def change(s):
    # Normalize a Chinese date such as "2022年5月1日" to "2022-5-1".
    s = s.replace('年', '-')
    s = s.replace('月', '-')
    return s[0:len(s) - 1]  # drop the trailing "日"

web = Chrome()
web.get('http://www.xinjiang.gov.cn/xinjiang/xjyw/common_list_99.shtml')
time.sleep(3)
res = {
    'data': []
}
# Resume from previously collected results.
with open('./data/ind.json', 'r', encoding='utf-8') as f:
    s = f.read()
    res = json.loads(s)
titles_pos = '/html/body/div[4]/div[2]/div[2]/div[2]/ul/li/div/div[1]/a'
times_pos = '/html/body/div[4]/div[2]/div[2]/div[2]/ul/li/div/div[4]/span[1]'
for _ in range(130):
    titles = web.find_elements(by=By.XPATH, value=titles_pos)
    titles = titles[1:]  # skip the first match
    times_list = web.find_elements(by=By.XPATH, value=times_pos)
    times_list = times_list[1:]
    for i, x in enumerate(titles):
        cur_time = times_list[i].text
        cur_data = {
            'title': x.text,
            'time': cur_time
        }
        res['data'].append(cur_data)
    # Checkpoint after every page; raise the modulus to save less often.
    if _ % 1 == 0:
        with open('./data/ind.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(res, ensure_ascii=False))
    nxt_tab_pos = '/html/body/div[4]/div[2]/div[2]/div[2]/div/div[1]/a[6]'
    web.find_element(by=By.XPATH, value=nxt_tab_pos).click()
    time.sleep(3)
with open('./data/ind.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(res, ensure_ascii=False))
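The fixed time.sleep(3) can be replaced with an explicit wait that polls until the next-page link is actually clickable; a minimal sketch using Selenium's WebDriverWait (same nxt_tab_pos as above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the "next page" link to become clickable,
# then click it; raises TimeoutException if it never does.
WebDriverWait(web, 10).until(
    EC.element_to_be_clickable((By.XPATH, nxt_tab_pos))
).click()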
Data processing
import json
import datetime

locate_name = './data/locate_data/hebei.json'
now_locate = "河北"
all_name = './data/all_info_time.json'
is_first = True
locate_info = {}
with open(locate_name, 'r', encoding='utf-8') as f:
    locate_info = json.loads(f.read())

def fail_equal(a, b):
    # Count how many characters of a also occur in b; the titles are
    # considered a match (return False, "not different") when the
    # count exceeds half of b's length.
    num = len(b)
    ans = 0
    for i in a:
        if b.find(i) != -1:
            ans = ans + 1
    if ans > num * 0.5:
        return False
    else:
        return True
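fail_equal is a crude character-overlap test. The standard library's difflib offers a comparable similarity measure; a minimal sketch (fail_equal_difflib is a hypothetical drop-in, keeping the 0.5 threshold from above):

import difflib

def fail_equal_difflib(a, b):
    # SequenceMatcher's ratio is in [0, 1]; treat the titles as
    # "different" (True) when similarity is at most 0.5.
    return difflib.SequenceMatcher(None, a, b).ratio() <= 0.5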
def find_locate_time(policy_name, lim_d):
    # Smallest non-negative day gap (lim_d minus a matching local
    # policy's date); -1 if there is no match within 400 days.
    ans = 10000000
    for item_policy in locate_info['data']:
        if fail_equal(item_policy['title'], policy_name):
            continue
        t = item_policy['time'] + ' 00:00:00'
        cur_d = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
        delt_day = (lim_d - cur_d).days
        if delt_day < 0:
            continue
        ans = min(ans, delt_day)
    if ans == 10000000 or ans > 400:
        ans = -1
    return ans
def work_for_tag():
    all_info = {}
    with open(all_name, 'r', encoding='utf-8') as f:
        all_info = json.loads(f.read())
    res = {}
    final_ans_position = './data/find.json'
    if is_first == False:
        with open(final_ans_position, 'r', encoding='utf-8') as f:
            res = json.loads(f.read())
    sum_num = 0
    for fr_name, fr_value in all_info.items():
        if is_first == True:
            res[fr_name] = {}
        for se_name, se_value in fr_value.items():
            if is_first == True:
                res[fr_name][se_name] = {}
            num = 0
            sum_days = 0
            for item_policy in se_value:
                sum_num = sum_num + 1
                this_time = item_policy['time'] + " 00:00:00"
                d1 = datetime.datetime.strptime(this_time, '%Y-%m-%d %H:%M:%S')
                d2 = find_locate_time(item_policy['title'], d1)
                if d2 == -1:
                    continue
                num += 1
                sum_days += d2
            if num == 0:
                continue
            res[fr_name][se_name][now_locate] = {}
            res[fr_name][se_name][now_locate]['num'] = num
            res[fr_name][se_name][now_locate]['sum'] = sum_days
            res[fr_name][se_name][now_locate]['avg'] = sum_days / num
    print('sum_num', sum_num)
    with open(final_ans_position, 'w', encoding='utf-8') as f:
        f.write(json.dumps(res, ensure_ascii=False))
def work_for_time():
    all_info = {}
    with open(all_name, 'r', encoding='utf-8') as f:
        all_info = json.loads(f.read())
    res = {}
    final_ans_position = './data/find.json'
    if is_first == False:
        with open(final_ans_position, 'r', encoding='utf-8') as f:
            res = json.loads(f.read())
    sum_num = 0
    for fr_name, fr_value in all_info.items():
        if is_first == True:
            res[fr_name] = {}
        for se_name, se_value in fr_value.items():
            if is_first == True:
                res[fr_name][se_name] = {}
            num = 0
            sum_days = 0
            print(fr_name, se_name)
            for item_policy in se_value:
                sum_num = sum_num + 1
                this_time = item_policy['time'] + " 00:00:00"
                d1 = datetime.datetime.strptime(this_time, '%Y-%m-%d %H:%M:%S')
                d2 = find_locate_time(item_policy['title'], d1)
                if d2 == -1:
                    continue
                num += 1
                sum_days += d2
            if num == 0:
                continue
            res[fr_name][se_name][now_locate] = {}
            res[fr_name][se_name][now_locate]['num'] = num
            res[fr_name][se_name][now_locate]['sum'] = sum_days
            res[fr_name][se_name][now_locate]['avg'] = sum_days / num
            print('num', num, 'sum', sum_days)
    with open(final_ans_position, 'w', encoding='utf-8') as f:
        f.write(json.dumps(res, ensure_ascii=False))

work_for_time()
Make good use of the various processing tools
# Fetching a page and extracting text with XPath (url assumed defined):
import requests
from lxml import etree
import json

header_data = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}
resp = requests.get(url, headers=header_data)
resp.encoding = 'utf-8'
time_list = '/html/body/div[4]/div/div[1]/div[2]/div[1]/ul/li/span/text()'
title_list = '/html/body/div[4]/div/div[1]/div[2]/div[1]/ul/li/a/text()'
text_content = resp.text
tree = etree.HTML(text_content)
titles = tree.xpath(title_list)

# Serializing to JSON without escaping non-ASCII characters:
s = '{}'
a = json.dumps(s, ensure_ascii=False)

# Parsing dates and computing a day difference (t and lim_d assumed defined):
import datetime
cur_d = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
delt_day = (lim_d - cur_d).days
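For reference, a self-contained run of the two datetime lines above, with concrete stand-in dates:

import datetime

lim_d = datetime.datetime.strptime('2022-05-01 00:00:00', '%Y-%m-%d %H:%M:%S')
cur_d = datetime.datetime.strptime('2022-04-28 00:00:00', '%Y-%m-%d %H:%M:%S')
print((lim_d - cur_d).days)  # 3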
Checking whether a Python dict contains a key
d = {'name': '', 'age': '', 'sex': ''}  # avoid shadowing the built-in dict
print('name' in d)  # True
print('id' in d)    # False
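Besides the in test, dict.get looks a key up without raising KeyError when it is absent:

print(d.get('id'))         # None: absent key, no exception
print(d.get('id', 'n/a'))  # 'n/a': explicit default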
Writing XPath expressions
Get text:         //tag1[@attr1="value1"]/tag2[@attr2="value2"]/.../text()
Get an attribute: //tag1[@attr1="value1"]/tag2[@attr2="value2"]/.../@attr
ul/li/a[not(@class)] selects only the <a> elements that have no class attribute.
title_list[i].attrib['title'] reads the title attribute of an element node.
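A small self-contained demonstration of the four patterns above (the HTML snippet is invented for illustration):

from lxml import etree

html = ('<ul>'
        '<li><a class="hot" title="A" href="/a">first</a></li>'
        '<li><a title="B" href="/b">second</a></li>'
        '</ul>')
tree = etree.HTML(html)
print(tree.xpath('//li/a/text()'))               # ['first', 'second']
print(tree.xpath('//li/a/@href'))                # ['/a', '/b']
print(tree.xpath('//li/a[not(@class)]/text()'))  # ['second']
first_a = tree.xpath('//li/a')[0]
print(first_a.attrib['title'])                   # 'A'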
Multithreading
from multiprocessing.pool import ThreadPool

num_processes = 8
my_pool = ThreadPool(processes=num_processes)
# down_pic, img_src and img_name come from the surrounding crawler.
my_pool.apply_async(func=down_pic, args=(img_src, img_name))
my_pool.close()
my_pool.join()
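A self-contained sketch of the same pattern, with a placeholder worker standing in for the real download function:

from multiprocessing.pool import ThreadPool

def down_pic(img_src, img_name):
    # Placeholder worker: a real version would download img_src and
    # save it to img_name.
    print('downloading', img_src, '->', img_name)

my_pool = ThreadPool(processes=8)
for i in range(20):
    my_pool.apply_async(func=down_pic,
                        args=('http://example.com/%d.jpg' % i, '%d.jpg' % i))
my_pool.close()  # no further tasks may be submitted
my_pool.join()   # wait for all queued tasks to finish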
Adding cookies
cookies = {
    'pxcts': '54925705-ab67-11ed-a31b-7649476e7656',
    '_pxvid': '5492484b-ab67-11ed-a31b-7649476e7656',
    'g_state': '{"i_l":0}',
    'auth_secure': '__108a9d39dd896b874e8f%3B%22a133d7b127a62dfd8739cd2201f6bfd3%22',
    'userinfo': '__d4ee4318a50dd8122201%3B%7B%22username%22%3A%22zhangmingdev%22%2C%22uniqueid%22%3A%22be5317d52a4f8efe1fabac1b4169bf74%22%2C%22dvs9-1%22%3A1%2C%22ab%22%3A%22tao-dpp-1-a-9%7Ctao-epp-1-a-9%7Ctao-rpb-1-a-6%7Ctao-btu-1-b-5%7Ctao-rpt-1-a-6%22%7D',
    'auth': '__1809b3af7318fdc703e4%3B%22cef4ba424b10f8176d7212d7b5830be1%22',
    'vd': '__4fc11d75a5dc1c177f16%3B%22Bj4b%5C%2F7%2CBkCt%5C%2FV%2CA%2CC%2CA%2C%2CB%2CA%2CB%2CBkCt%5C%2FV%2CBkCt%5C%2Fl%2CA%2CA%2CA%2CA%2C13%2CA%2CB%2CA%2CA%2CA%2CA%2CB%2CA%2CA%2C%22',
    'td': '0:1785%3B6:1349x733%3B7:991%3B12:1143x1001%3B20:999',
}
headers = {
    'authority': 'www.deviantart.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,und;q=0.6',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'referer': 'https://www.deviantart.com/topic',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
resp = requests.get(url,
                    timeout=timeout,
                    headers=headers,
                    cookies=cookies)
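When many requests share the same cookies and headers, a requests.Session can carry them implicitly and reuses the underlying connection; a sketch with the two dicts above (url and timeout as before):

session = requests.Session()
session.headers.update(headers)  # sent with every request from now on
session.cookies.update(cookies)  # persisted across requests
resp = session.get(url, timeout=timeout)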
Adding a proxy
import socket
import requests
import json

socket.setdefaulttimeout(10)

# headers: reuse the headers dict from the previous section.

class get_net:
    def __init__(self):
        self.proxy = {}
        self.proxy_list = []
        self.get_num = 0
        self.max_get_num = 10  # rotate to a new proxy every 10 requests

    def get_proxy(self):
        # Pull a fresh batch of proxies from the provider's API.
        get_api = 'http://api.proxy.ipidea.io/getBalanceProxyIp?num=20&return_type=json&lb=1&sb=0&flow=1&regions=&protocol=https'
        resp = requests.get(get_api,
                            proxies=self.proxy,
                            timeout=10,
                            stream=False,
                            headers=headers)
        resp.encoding = 'utf-8'
        self.proxy_list = json.loads(resp.text)['data']
        self.proxy_len = len(self.proxy_list)
        self.proxy_id = 0

    def update_proxy(self):
        if self.proxy_id == self.proxy_len:
            self.get_proxy()
        proxy_item = self.proxy_list[self.proxy_id]
        self.proxy_id += 1
        self.set_proxy(proxy_item['ip'], proxy_item['port'])

    def check_update_proxy(self):
        self.get_num += 1
        if self.get_num == self.max_get_num:
            self.update_proxy()
            self.get_num = 0

    def set_proxy(self, ip, port):
        self.proxy = {
            'http': 'http://' + ip + ':' + str(port),
            'https': 'http://' + ip + ':' + str(port)
        }

    def get_page(self, url, timeout=10):
        self.check_update_proxy()
        ans = ''  # avoid NameError if the request below raises
        try:
            if 'http' not in self.proxy:
                resp = requests.get(url, timeout=timeout,
                                    headers=headers)
            else:
                resp = requests.get(url, proxies=self.proxy,
                                    timeout=timeout, headers=headers)
            resp.encoding = 'utf-8'
            ans = resp.text
            resp.close()
        except Exception as e:
            print('get error', repr(e))
        return ans

    def get_image(self, url, save_path, timeout=20):
        self.check_update_proxy()
        resp = None  # avoid NameError if the request below raises
        try:
            if 'http' not in self.proxy:
                resp = requests.get(url, timeout=timeout,
                                    headers=headers)
            else:
                print('use proxy:', self.proxy)
                resp = requests.get(url, proxies=self.proxy,
                                    timeout=timeout, headers=headers)
            if resp.status_code == 200:
                print('download image', save_path)
                with open(save_path, 'wb') as file:
                    file.write(resp.content)
            resp.close()
        except Exception as e:
            print('get error', repr(e))
        return resp

request_net = get_net()
request_net.get_proxy()
request_net.update_proxy()
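To check that traffic really goes out through the proxy, one option is to fetch an IP-echo service such as httpbin.org/ip (used here purely for illustration):

# Should print a JSON body containing the proxy's IP once a proxy is set.
print(request_net.get_page('http://httpbin.org/ip'))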