# 仅供个人学习笔记使用。
from lxml import etree
# Parse the local HTML file with an explicit UTF-8 HTML parser.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('招聘网站.html', parser=parser)

# 1. Get every <li> element.
# xpath() returns a list of the matching elements.
all_items = html.xpath('//li')
for li in all_items:
    # tostring() yields bytes; decode so the markup prints as readable text.
    print(etree.tostring(li, encoding='utf-8').decode('utf-8'))
# 2. Get the second <li> element.
# XPath positions are 1-based, so the second element is [2], not [1].
# The parentheses make the index apply to the whole //li node-set in
# document order; a bare //li[2] would instead mean "every <li> that is
# the second <li> child of its own parent".
second_li = html.xpath('(//li)[2]')
if second_li:
    # Guarded: the page may contain fewer than two <li> elements.
    print(etree.tostring(second_li[0], encoding='utf-8').decode('utf-8'))
# 3. Get every <li> element whose class list contains con_list_item.
# Comparing @class to one exact string only matches elements carrying
# precisely that combination of classes. Padding the normalized attribute
# with spaces and searching for ' con_list_item ' matches the class as a
# whole token, whatever other classes accompany it.
job_items = html.xpath(
    "//li[contains(concat(' ', normalize-space(@class), ' '), ' con_list_item ')]"
)
for li in job_items:
    print(etree.tostring(li, encoding='utf-8').decode('utf-8'))
# 4. Get the href attribute of every <a> element.
# Selecting @href makes xpath() return the attribute values themselves
# (strings) rather than the <a> elements, so they are printed directly
# instead of being serialized with tostring().
hrefs = html.xpath('//a/@href')
for href in hrefs:
    print(href)
# 5. Collect every job posting as plain text.
def _text_at(node, query, index=0):
    """Return the stripped ``index``-th result of ``query`` under ``node``.

    Returns None when the query yields too few results, so callers can skip
    list items that are not complete job postings instead of crashing.
    """
    results = node.xpath(query)
    if len(results) <= index:
        return None
    return results[index].strip()


# (output key, XPath relative to the <li>, index of the result to keep).
# The leading '.' in each query restricts the search to descendants of the
# current element; without it '//' would search the whole document again.
# NOTE(review): '公司地址' is filled from the <h3> text - confirm that the
# key matches what the page actually puts in <h3>.
# NOTE(review): index 2 for 'li_b_l' presumably skips leading text nodes
# around a nested element - verify against the page markup.
_FIELD_QUERIES = (
    ('网址', './/a/@href', 0),
    ('公司地址', ".//h3/text()", 0),
    ('薪资', ".//span[@class='money']/text()", 0),
    ('工作经验', ".//div[@class='li_b_l']/text()", 2),
    ('招聘日期', ".//span[@class='format-time']/text()", 0),
    ('公司名称', ".//div/a[@data-lg-tj-id='8F00']/text()", 0),
    ('公司规模', ".//div[@class='industry']/text()", 0),
    ('公司介绍', ".//div[@class='li_b_r']/text()", 0),
)

positions = []
# position()>0 is true for every node (XPath positions start at 1), so this
# selects all <li> elements in the document.
for li in html.xpath('//li[position()>0]'):
    record = {
        key: _text_at(li, query, index)
        for key, query, index in _FIELD_QUERIES
    }
    # '//li' also matches list items that are not job postings (or are
    # incomplete); skip any item missing one of the expected fields.
    if None in record.values():
        continue
    positions.append(record)
print(positions)