import requests
from lxml import etree
import os
import re
# Scrape a word-list site: read the listing page, follow every article link,
# walk the article's pages (a.html, a-2.html, a-3.html, ...) and save the
# cleaned paragraph text to 'word<title>.txt' in the current directory.

# Root URL of the site; article hrefs from the listing page are appended to it.
# NOTE(review): the value was missing from the original source (`base =`),
# which is a syntax error -- fill in the real site root before running.
base = ''
# URL of the listing page that holds the article links.
# NOTE(review): the original called `requests.get()` with no argument;
# presumably the listing lives at or under `base` -- confirm.
index_url = base
# NOTE(review): defined but never used; output files are written to the
# current working directory.
novel_path = '会计学词汇中英文对照' + os.path.sep

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 10
# <li> elements of the listing page, one per article link.
LINK_ITEMS_XPATH = '/html/body/div/div[4]/div[1]/div[2]/ul/li'
# Text nodes of the paragraphs on an article page.
PARAGRAPH_TEXT_XPATH = '/html/body/div/div[4]/div[1]/div[2]/div/p/text()'
# Characters removed from every text node: ASCII letters, digits and
# punctuation plus a set of CJK punctuation marks. Inside the class `,-.` is
# a range that covers ',', '-' and '.'. Compiled once instead of per item.
UNWANTED_CHARS = re.compile('[a-zA-Z0-9!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’!^_`{|}~+()]')


def build_page_url(first_page_url, page):
    """Return the URL of page number *page* of an article.

    Page 1 is *first_page_url* itself; later pages put ``-<page>`` in front
    of the ``.html`` suffix (``a.html`` -> ``a-2.html``).
    """
    if page == 1:
        return first_page_url
    return first_page_url.split('.html')[0] + '-' + str(page) + '.html'


def clean_text(raw):
    """Return *raw* without the unwanted characters, spaces turned into newlines."""
    cleaned = UNWANTED_CHARS.sub('', raw)
    cleaned = cleaned.replace(' ', '\n')
    # NOTE(review): ASCII ';' is already removed by UNWANTED_CHARS, so this
    # replace is a no-op as written; possibly a full-width ';' was intended
    # -- confirm against the scraped pages.
    return cleaned.replace(';', '\n')


def fetch_tree(url):
    """Download *url*, decode it as UTF-8 and parse it with ``etree.HTML``.

    Raises ``requests.RequestException`` on network errors or timeout.
    Callers check the result for ``None`` before querying it, in case the
    parser produced no document.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def finalize_output(name):
    """Copy the non-blank lines of *name* to ``'word' + name``, then delete *name*."""
    with open(name, 'r', encoding='utf-8') as raw_file, \
            open('word' + name, 'a', encoding='utf-8') as word_file:
        word_file.writelines(line for line in raw_file if line.split())
    print('输出成功....')
    # The scratch file is removed only after both handles are closed.
    os.remove(name)


def scrape_article(name, m_url):
    """Scrape every page of one article into the file ``'word' + name``.

    Cleaned text is appended to the scratch file *name* page by page. The
    first page that yields no paragraph text marks the end of the article:
    the scratch file is then filtered into the final file and removed.

    Network, parser and file errors end the article (best effort, as in the
    original) but are reported instead of being silently swallowed.
    """
    page = 0
    while True:
        page += 1
        page_url = build_page_url(m_url, page)
        try:
            tree = fetch_tree(page_url)
            texts = [] if tree is None else tree.xpath(PARAGRAPH_TEXT_XPATH)
            print(texts)
            print()
            with open(name, 'a', encoding='utf-8') as raw_file:
                for item in texts:
                    cleaned = clean_text(item)
                    if cleaned != '':
                        raw_file.write(cleaned)
                        raw_file.write('\n')
                        print(name + "中写入" + cleaned)
            if not texts:
                finalize_output(name)
                break
        except (requests.RequestException, etree.LxmlError, OSError) as err:
            print('爬取结束!')
            # Show why scraping stopped so real failures are not hidden.
            print(err)
            break


def main():
    """Scrape every article linked from the listing page."""
    if not base:
        raise SystemExit('Set `base` (and `index_url`) to the site URL before running.')
    index_tree = fetch_tree(index_url)
    if index_tree is None:
        raise SystemExit('The listing page could not be parsed as HTML.')
    for entry in index_tree.xpath(LINK_ITEMS_XPATH):
        hrefs = entry.xpath('.//a/@href')
        titles = entry.xpath('.//a/text()')
        print(hrefs)
        print(titles)
        if not hrefs or not titles:
            # List item without a usable link: nothing to scrape.
            continue
        scrape_article(titles[0] + '.txt', base + hrefs[0])


if __name__ == '__main__':
    main()
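# python xpath的简单应用 (title of the source article: "A simple application of XPath in Python")
# 最新推荐文章于 2024-04-07 08:00:00 发布 (web-page residue: "latest recommended article published 2024-04-07 08:00:00")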