"""用 XPath 爬取单词,并写入表格。

链接:https://www.shanbay.com/wordlist/110521/232414/
"""
# 用xpath爬取单词 并写入表格
# 链接 https://www.shanbay.com/wordlist/110521/232414/
import requests
from lxml import etree
import xlwt
def write_to_excel(infos):
    """Write the scraped records to the spreadsheet ``word2.xls``.

    The keys of the first record become the header row (row 0). Each record
    is then written on its own row below the header, one column per header
    key, and the workbook is saved to ``word2.xls`` in the current directory.

    Args:
        infos: Sequence of dict records. Every record is looked up with the
            keys of the first one, so a record missing one of those keys
            raises ``KeyError``.

    Returns:
        None. When ``infos`` is empty nothing is written and a notice is
        printed instead.
    """
    if not infos:
        # There is no first record to derive the header from; bail out
        # instead of failing with IndexError on infos[0].
        print('没有数据,未写入文件!')
        return

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('word_list')

    # sheet.write(row, column, content): the header occupies row 0.
    head = list(infos[0].keys())
    for col, title in enumerate(head):
        sheet.write(0, col, title)

    # Data rows start right below the header, at row 1.
    for row, item in enumerate(infos, start=1):
        for col, key in enumerate(head):
            sheet.write(row, col, item[key])

    workbook.save('word2.xls')
    print('写入成功!')
def get_text(text):
    """Return the first item of ``text``, or ``''`` when ``text`` is falsy.

    Used to unwrap an XPath result list that may be empty.
    """
    return text[0] if text else ''
def parse_page(html_str):
    """Parse one page of HTML and collect its word/meaning pairs.

    Every ``<tr>`` under a ``<tbody>`` is inspected. The word is read from
    the ``<strong>`` inside the ``td`` with class ``span2`` and the meaning
    from the text of the ``td`` with class ``span10``. Rows where either
    value is empty are skipped. Each kept pair is printed and appended, as
    a dict with keys ``'en'`` and ``'zh'``, to the ``infos`` list, which is
    not defined here and must exist at module level before this is called.

    Args:
        html_str: HTML source of the page.
    """
    # Turn the HTML text into an element tree that can be queried by XPath.
    root = etree.HTML(html_str)
    for row in root.xpath('//tbody/tr'):
        # XPath found via the browser dev tools, then adjusted by hand:
        # the span2 cell wraps the word in <strong>, hence /strong/text().
        en = get_text(row.xpath('.//td[@class="span2"]/strong/text()'))
        zh = get_text(row.xpath('.//td[@class="span10"]/text()'))
        # Keep the row only when both the word and its meaning are present.
        if not (en and zh):
            continue
        item = {'en': en, 'zh': zh}
        print(item)
        infos.append(item)
def main():
    """Download the word-list pages and pass each one to ``parse_page``.

    Only page 1 is fetched at the moment (``range(1, 2)``); widen the range
    to crawl more pages.

    Raises:
        requests.RequestException: if a request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    # The page number in the query string is replaced by %s so it can be
    # filled in for each page.
    base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=%s'
    # User-Agent copied from a browser's dev tools.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    # Pagination: request each page in turn.
    for page in range(1, 2):
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(base_url % page, headers=headers, timeout=10)
        # Fail loudly on 4xx/5xx rather than parsing an error page as data.
        response.raise_for_status()
        parse_page(response.text)
if __name__ == '__main__':
    # Script entry point. ``infos`` has to be a module-level name because
    # parse_page appends the scraped records to it.
    infos = []
    main()
    write_to_excel(infos)
# 未完待续……