# Practice with the urllib library: parsing query data via urllib.parse.
# Sincere thanks to opensourceChina.
# -*- coding: UTF-8 -*-
__author__ = 'zy'
__time__ = '2019/3/1 21:05'
#//*[@id="ctl00"]/table/tbody/tr[2]/td/table/tbody/tr[2]/td[2]/a/text()
from lxml import etree
def clean(text):
    """Normalize a scraped text fragment.

    Removes carriage returns and newlines, removes the literal substring
    "space", and strips leading/trailing whitespace.

    NOTE(review): replace("space", "") removes the literal word "space",
    not whitespace characters -- presumably matching junk text in the
    scraped CNKI markup; confirm against the source HTML.
    """
    # Removing '\r' and '\n' individually already covers every '\r\n'
    # pair, so the original redundant replace('\r\n', '') was dropped.
    text = text.replace('\r', '')
    text = text.replace('\n', '')
    text = text.replace("space", "")
    return text.strip()
from urllib import parse
def get_detail_url(url):
    """Build the canonical CNKI detail-page URL from a search-result href.

    The href in the result list carries (among others) the query
    parameters ``FileName``, ``DbCode`` and ``DbName``; this extracts
    them and rebuilds the direct ``detail.aspx`` URL.

    Raises KeyError if any of the three parameters is missing.
    """
    query = parse.parse_qs(parse.urlparse(url).query)
    # parse_qs returns a list of values per key; take the first one.
    data = {
        'dbcode': query['DbCode'][0],
        'dbname': query['DbName'][0],
        'filename': query['FileName'][0],
    }
    # urlencode handles quoting and joining, which is cleaner than
    # concatenating '&key=' fragments by hand.
    return 'http://kns.cnki.net/KCMS/detail/detail.aspx?' + parse.urlencode(data)
# Parse the locally saved CNKI result page and scrape each result row.
html = etree.parse('try_zhiwang.html', etree.HTMLParser())

# Rows of the result table.  XPath positions are 1-based and tr[1] is the
# header row, so the data rows are tr[2] .. tr[len(pages)].
all_paper = '//table[@class="GridTableContent"]//tr'
nu_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[1]/text()'    # sequence number
tm_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[2]/a//text()'  # title (may span several text nodes)
au_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[3]//text()'   # authors
or_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[4]/a/text()'  # source / journal
ti_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[5]/text()'    # publication date
da_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[6]/text()'    # database name
url_xpath = '//table[@class="GridTableContent"]//tr[{num}]/td[2]/a/@href'  # detail-page href

pages = html.xpath(all_paper)
print(len(pages))

Data1 = []  # sequence numbers
Data2 = []  # titles
Data3 = []  # authors (cleaned)
Data4 = []  # sources
Data5 = []  # dates (cleaned)
Data6 = []  # databases (cleaned)
Data7 = []  # canonical detail URLs

# Iterate over data rows 2 .. len(pages), skipping the header row.
# (The original loop produced the same indices by mutating the range
# variable: for i in range(len(pages)) with i += 1 whenever i != 0,
# silently dropping i == 0 -- equivalent but much harder to read.)
for row in range(2, len(pages) + 1):
    nu = html.xpath(nu_xpath.format(num=row))
    Data1.append(nu[0])

    # Titles can be split across multiple text nodes (e.g. highlighted
    # keywords), so join them before storing.
    tm = ''.join(html.xpath(tm_xpath.format(num=row)))
    Data2.append(tm)

    au = ''.join(html.xpath(au_xpath.format(num=row)))
    print(clean(au))
    Data3.append(clean(au))

    or_ = html.xpath(or_xpath.format(num=row))
    Data4.append(or_[0])

    ti = html.xpath(ti_xpath.format(num=row))
    Data5.append(clean(ti[0]))

    da = html.xpath(da_xpath.format(num=row))
    Data6.append(clean(da[0]))

    href = html.xpath(url_xpath.format(num=row))
    Data7.append(get_detail_url(href[0]))
import pandas as pd
import xlwt  # engine pandas uses for legacy .xls output

# Template for the alternative detail-URL form seen in some hrefs, e.g.
# http://kns.cnki.net/KCMS/detail/33.1273.Z.20190301.1555.002.html
# (currently unused; kept for reference).
url_details = 'http://kns.cnki.net/KCMS/detail/{details}.html'

# Assemble the scraped columns into a single frame.
# Column order: number, title, authors, source, date, database, detail URL.
df1 = pd.DataFrame({'Data1': Data1,
                    'Data2': Data2,
                    'Data3': Data3,
                    'Data4': Data4,
                    'Data5': Data5,
                    'Data6': Data6,
                    'Data7': Data7})

# Dump to Excel without index or header rows.  The former
# encoding='utf-8' argument was dropped: it was deprecated in pandas 1.5
# and removed in 2.0 (TypeError), and Excel output is Unicode anyway.
df1.to_excel('a.xls', index=False, header=False)
# Tip from the original author: tr[position()>1] selects all rows after
# the header in a single XPath query.