针对部分文本无法直接获取的情况(例如文本嵌套在子标签中),在xpath语句中使用“descendant-or-self::text()”,即可获取当前节点自身及其所有后代节点中的文本
个人常用xpath爬虫格式:
import requests
from lxml import etree
from fake_useragent import UserAgent
import urllib
from xlrd import open_workbook
from xlutils.copy import copy
# Set up the request headers: the User-Agent value is taken from
# fake_useragent's ``random`` attribute when this module is loaded.
ua = UserAgent(verify_ssl=False)
headers = {"User-Agent": ua.random}
#获取url链接的xml格式
def getxml(url):
    """Download ``url`` and return its content parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The element tree produced by ``etree.HTML`` from the response text.

    Any exception raised by ``requests.get`` (for example on timeout)
    propagates to the caller unchanged.
    """
    # Pass the headers by keyword: the second positional parameter of
    # requests.get is ``params``, so a positional dict is sent as
    # query-string parameters and the User-Agent header is never set.
    res = requests.get(url, headers=headers, timeout=30)
    # Decode the body with the encoding requests detects from its content
    # instead of the one declared in the response headers.
    res.encoding = res.apparent_encoding
    return etree.HTML(res.text)
#获取详细信息
urllink= 'https://www.tianyancha.com/elibs_quoted/p'
for i in range(1,270):
url = urllink + str(i)
print(url)
xml = getxml(url)
eles = xml.xpath('//div[@class="elib-table"]//tbody/tr/td/descendant-or-self::text()')
#写入excel
rexcel = open_workbook("D:/名单.xls")
excel = copy(rexcel)
table = excel.get_shee