# Part 1
# Build the URLs
import requests,pandas as pd,numpy as np,re,time,json
from lxml import etree
# Collect the board list URLs to crawl and the fid value of each one.
BASE_URL = 'http://liuyan.people.com.cn'
REQUEST_TIMEOUT = 30  # seconds; requests.get has no timeout by default

# Fetch the fid=41 list page once and take the first link of each of its
# first twelve list items.
res1 = requests.get(BASE_URL + '/forum/list?fid=41', timeout=REQUEST_TIMEOUT)
html1 = etree.HTML(res1.text)
t1 = [
    BASE_URL
    + html1.xpath('/html/body/div[5]/div[4]/ul/li[' + str(i) + ']/b/a/@href')[0]
    for i in range(1, 13)
]

# Governors and mayors.
# Shaanxi provincial party secretary, governor.
shangzhang = [
    BASE_URL + '/threads/list?fid=' + str(fid)
    for fid in (591, 592, 5173, 5174, 4454)
]
shizhang = [
    BASE_URL + '/threads/list?fid=' + str(fid) for fid in range(1367, 1386)
]

# Follow every page in t1 and gather all list-item links found on it.
t2 = []
for page_url in t1:
    res2 = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    html2 = etree.HTML(res2.text)
    t2.extend(html2.xpath('/html/body/div[5]/div[4]/ul/li/b/a/@href'))

# t1 joins the same kind of href to the host without a separator, so these
# hrefs presumably start with '/'; strip it so the result has no '//' path.
xianzhang = [BASE_URL + '/' + href.lstrip('/') for href in t2]

ren = shangzhang + shizhang + xianzhang
# The fid is the text after the first '=' of each URL.
ren_fid = [board_url.split('=')[1] for board_url in ren]
# Part 2
from selenium import webdriver
import time
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool
# Board ids to crawl: the column labelled 0 of the workbook.
_leaders_sheet = pd.read_excel('C:\\Users\\admin\\Desktop\\陕西领导.xlsx')
list_ = list(_leaders_sheet[0])

# Shared accumulator that `second` fills with thread links.
cdd = []
def second(url=None):
    """Collect the thread links of one board into the module-level ``cdd``.

    Opens ``http://liuyan.people.com.cn/threads/list?fid=<url>`` in a new
    Chrome session, clicks the ``show_more`` element until a click fails, then
    appends the ``href`` of every ``//*[@id="list_content"]/li/h2/b/a`` link
    to ``cdd``.

    Args:
        url: Board id (``fid``); it is converted with ``str`` and appended to
            the list URL.

    Returns:
        The shared ``cdd`` container itself, which also holds links added by
        other calls.
    """
    a = webdriver.Chrome()
    try:
        a.get('http://liuyan.people.com.cn/threads/list?fid=' + str(url))
        b = a.find_element_by_xpath('//*[@id="show_more"]')
        # Keep clicking "show more"; a failing click is the only stop signal.
        while True:
            try:
                b.click()
                time.sleep(2)
            except Exception:
                break
        # Query the driver for all links at once instead of probing li[1],
        # li[2], ... until a lookup fails, which stopped at the first <li>
        # that had no link.
        links = a.find_elements_by_xpath('//*[@id="list_content"]/li/h2/b/a')
        hrefs = [link.get_attribute('href') for link in links]
        cdd.extend(hrefs)
    finally:
        # Always end the browser session, even when the page lookup raised.
        a.quit()
    return cdd
# Run `second` for every id in `list_` on ten worker threads.
pool = ThreadPool(10)
try:
    pool.map(second, list_)
finally:
    # Shut the workers down even when one of the crawls raised.
    pool.close()
    pool.join()

# Drop duplicate links and report how many distinct ones were collected.
cdd = set(cdd)
print(len(cdd))
# NOTE(review): the export below is disabled, yet a CSV at this path is read
# later in this file; confirm whether it is meant to be run.
#pd.DataFrame(cdd).to_csv('C:\\Users\\admin\\Desktop\\陕西详情页网址.csv')
# Part 3
import requests,pandas as pd,numpy as np,re,time,json
from lxml import etree
# Detail-page URLs: the column labelled '0' of the CSV.
cd = pd.read_csv('C:\\Users\\admin\\Desktop\\陕西详情页网址.csv')['0']

# Shared accumulator; `tri_part` appends one row per scraped page.
b = []
def _first(nodes, default=''):
    """Return the first item of an XPath result list, or ``default`` if empty."""
    return nodes[0] if nodes else default


def tri_part(url=None):
    """Scrape one thread detail page and append its fields to the global ``b``.

    The appended row holds 17 values in this order: city level,
    district/county, message title, processing status, message field, message
    type, message time, message content, reply unit, reply content, reply
    time, overall rating, degree of resolution, handling attitude, handling
    speed, evaluation details, evaluation time.

    Fields that the original code indexed with ``[0]`` are stored as a single
    string, or ``''`` when the page has no such node. The remaining fields are
    stored as the raw XPath result lists.

    Args:
        url: Address of the detail page to fetch.

    Returns:
        None. The row is appended to ``b``; a page whose request fails is
        reported with ``print`` and skipped.
    """
    try:
        res = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole pool.map run.
        print(url, exc)
        time.sleep(5)
        return
    html = etree.HTML(res.text)

    Municipal_level = html.xpath('/html/body/div[4]/a[3]/text()')  # city level
    district_and_county = _first(html.xpath('/html/body/div[4]/i/text()'))  # district/county
    Message_title = _first(html.xpath('/html/body/div[6]/h2/b/span/text()'))  # message title
    Processing_status = _first(html.xpath('/html/body/div[6]/h2/b/em[1]/text()'))  # processing status
    Message_field = _first(html.xpath('/html/body/div[6]/h2/b/em[2]/text()'))  # message field
    Message_type = _first(html.xpath('/html/body/div[6]/h2/b/em[3]/text()'))  # message type
    # Characters 28..43 of the span text are kept as the message time.
    Message_time = _first(html.xpath('/html/body/div[6]/h3/span/text()'))[28:44]  # message time
    Message_content = _first(html.xpath('/html/body/div[6]/p/text()'))  # message content
    Reply_unit = html.xpath('/html/body/div[9]/ul/li[1]/h3[1]/i/text()')  # reply unit
    Reply_content = html.xpath('/html/body/div[9]/ul/li[1]/p/text()')  # reply content
    Reply_time = html.xpath('/html/body/div[9]/ul/li[1]/h3[2]/em/text()')  # reply time
    Comprehensive_evaluation = html.xpath('/html/body/div[9]/ul/li[2]/h4[1]/span/div/text()')  # overall rating
    Degree_of_solution = html.xpath('/html/body/div[9]/ul/li[2]/h4[1]/span/span[1]/span/text()')  # degree of resolution
    Handling_attitude = html.xpath('/html/body/div[9]/ul/li[2]/h4[1]/span/span[2]/span/text()')  # handling attitude
    Processing_speed = html.xpath('/html/body/div[9]/ul/li[2]/h4[1]/span/span[3]/span/text()')  # handling speed
    Evaluation_details = html.xpath('/html/body/div[9]/ul/li[2]/p/text()')  # evaluation details
    Evaluation_time = html.xpath('/html/body/div[9]/ul/li[2]/h4[2]/em/text()')  # evaluation time

    print(url)
    b.append([
        Municipal_level,
        district_and_county,
        Message_title,
        Processing_status,
        Message_field,
        Message_type,
        Message_time,
        Message_content,
        Reply_unit,
        Reply_content,
        Reply_time,
        Comprehensive_evaluation,
        Degree_of_solution,
        Handling_attitude,
        Processing_speed,
        Evaluation_details,
        Evaluation_time,
    ])
    # Pause between pages handled by this worker.
    time.sleep(5)
# Scrape every URL in `cd` on ten worker threads; rows accumulate in `b`.
pool = ThreadPool(10)
try:
    pool.map(tri_part, cd)
finally:
    # Shut the workers down even when one of the scrapes raised.
    pool.close()
    pool.join()

# Column labels, in the same order as the values of each row in `b`.
columns = ['市级','区县','留言题目','办理状态','留言领域','留言类型','留言时间','留言内容','回复单位','回复内容','回复时间','综合评价','解决程度','办理态度','办理速度','评价详情','评价时间']
# Passing the labels directly also keeps the headers when `b` is empty.
end_df = pd.DataFrame(b, columns=columns)
end_df.to_excel('C:\\Users\\admin\\Desktop\\留言板详细数据.xlsx')
# Part 4
# Reload the scraped sheet, turn every column into text and strip square
# brackets, carriage returns, non-breaking spaces and newlines from it.
data = pd.read_excel('C:\\Users\\admin\\Desktop\\留言板详细数据.xlsx')
# Compiled once; the character class removes the same characters as the
# original alternation '\[\]|\[|\]|\r|\xa0|\n'.
_noise_chars = re.compile(r'[\[\]\r\xa0\n]')
for i in data.columns:
    data[i] = data[i].astype('str').apply(lambda x: _noise_chars.sub('', x))
    print(i)
def time1(t):
    """Insert a space after the first ten characters of ``t``.

    Args:
        t: Text whose first ten characters are to be separated from the rest,
            e.g. ``'2020-01-0112:30'``.

    Returns:
        ``t[:10] + ' ' + t[10:]`` when ``t`` is longer than ten characters;
        otherwise ``t`` unchanged, so empty or short placeholder values do not
        gain a stray space.
    """
    if len(t) <= 10:
        return t
    return t[:10] + ' ' + t[10:]
# Apply `time1` to the three timestamp columns. Each column is assigned as a
# whole: element-wise `data[col][i] = ...` is chained indexing, which may
# write to a temporary copy instead of `data`.
for column in ('留言时间', '回复时间', '评价时间'):
    data[column] = data[column].fillna('').map(time1)

data.to_excel('C:\\Users\\admin\\Desktop\\陕西省领导留言板详细数据.xlsx')