# Python crawler tool: collect website keywords from Chinaz (站长之家) and export them
# Keyword collection for websites via Chinaz (站长之家)
import re

import requests
from lxml import etree
import xlsxwriter
def hqsj(url, *, timeout=10, all_rows=False):
    """Scrape one rank.chinaz.com keyword page and return keyword data.

    The page at ``url`` (for example
    ``http://rank.chinaz.com/?host=www.ugainian.com``) is downloaded and
    parsed, and one row is assembled per keyword entry::

        [keyword, overall index, PC index, mobile index,
         Baidu ranking, indexed-page count, page title, page link]

    Args:
        url: Address of the ranking page to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script forever.
        all_rows: When true, return every row found on the page as a list
            of rows. When false (the default, kept for backward
            compatibility) return only the last row.

    Returns:
        With ``all_rows=False``: a single 8-item row, or ``[]`` when the
        page contains no complete keyword entry.
        With ``all_rows=True``: a list of 8-item rows (possibly empty).
    """
    html = requests.get(url, timeout=timeout).text
    tree = etree.HTML(html)

    # Keyword text.
    keywords = tree.xpath(
        '//div[@class="w25-0 tl pl10 pr pbimg showt"]/a/text()')

    # The "w8-0" cells are read in groups of four per keyword, in the order
    # overall index, PC index, mobile index, Baidu ranking.
    datas = tree.xpath('//div[@class="w8-0"]/a/text()')
    overall_index = datas[0::4]
    pc_index = datas[1::4]
    mobile_index = datas[2::4]
    baidu_ranking = datas[3::4]

    # Indexed-page count.
    collection = tree.xpath('//div[@class="w8-0 bor-r1s05"]/a/text()')

    titles = []
    links = []
    for anchor in tree.xpath('//div[@class="R-home-w"]/a'):
        titles.append(anchor.attrib['title'])  # page title
        # The link lives in the onclick handler, presumably of the form
        # window.open('<url>'): drop the call name, then the two wrapping
        # characters on each side.
        onclick = anchor.attrib['onclick']
        links.append(onclick.replace('window.open', "")[2:-2])

    # zip() stops at the shortest column, so only complete rows are kept.
    rows = [
        list(row)
        for row in zip(keywords, overall_index, pc_index, mobile_index,
                       baidu_ranking, collection, titles, links)
    ]

    if all_rows:
        return rows
    # Historical contract: a single (last) row; an empty page yields [].
    return rows[-1] if rows else []
def hqlj(wz, timeout=10):
    """Build the list of ranking-page URLs to scrape for a site.

    The landing page ``http://rank.chinaz.com/?host=<wz>`` is fetched and its
    pager text is inspected for the total page count.

    Args:
        wz: Host name of the site to look up; it is interpolated into the
            rank.chinaz.com URLs as-is.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script forever.

    Returns:
        A list of page URLs. When no pager text is found, or it contains no
        number, the list holds only the landing-page URL; otherwise it holds
        one ``http://rank.chinaz.com/<wz>-0--0-<n>`` URL per page.
    """
    url = f"http://rank.chinaz.com/?host={wz}"
    html = requests.get(url, timeout=timeout).text
    tree = etree.HTML(html)

    page = tree.xpath(
        '/html/body/form/div[2]/div[3]/div[1]/div[2]/span[1]/text()')
    print(page)
    if not page:
        # No pager on the page: only the landing page is scraped.
        return [url]

    # Take the first run of digits from the pager text instead of stripping
    # the exact surrounding label, so stray whitespace or punctuation
    # variants cannot make int() fail.
    match = re.search(r'\d+', page[0])
    if match is None:
        return [url]
    total_pages = int(match.group())
    print(total_pages)

    ljs = []
    for x in range(1, total_pages + 1):
        lj = f'http://rank.chinaz.com/{wz}-0--0-{x}'
        print(lj)
        ljs.append(lj)
    return ljs
if __name__ == '__main__':
    # Script entry point: ask for a site, scrape each of its ranking pages,
    # and export the collected rows to an Excel workbook.
    # Surrounding whitespace is stripped because the value is interpolated
    # directly into request URLs.
    wz = input("请输入网址链接:").strip()
    urls = hqlj(wz)

    data_list = []
    for url in urls:
        dat = hqsj(url)
        print(dat)
        data_list.append(dat)
    print(data_list)

    # NOTE(review): 'e:kami1.xlsx' has no path separator after the drive
    # letter; on Windows that is usually resolved against the current
    # directory of drive E: -- confirm this is the intended location.
    # The context manager closes (and thereby saves) the workbook even if
    # writing a row raises.
    with xlsxwriter.Workbook('e:kami1.xlsx') as workbook:
        worksheet = workbook.add_worksheet()
        # Header row.
        header = ['关键字', '整体指数', 'pc指数', '移动指数', '百度排名', '收录量', '网页标题', '网页链接']
        worksheet.write_row('A1', header)
        # Each hqsj() result becomes one spreadsheet row. Data starts at
        # zero-based row index 1 (Excel row 2), directly under the header.
        for row_idx, da in enumerate(data_list, start=1):
            worksheet.write_row(row_idx, 0, da)