XPath 语法运用实例【爬取 Boss 直聘】

一:url的处理

import urllib.request
from lxml import etree
def bo_url(url):
    """Open *url* with a browser-like User-Agent header and return the response.

    Args:
        url: Address to request.

    Returns:
        The object returned by ``urllib.request.urlopen``. It is handed back
        still open; the caller reads (and should close) it.

    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    # Send a desktop Chrome User-Agent instead of urllib's default one.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"
    )
    request = urllib.request.Request(url=url, headers={"User-Agent": user_agent})
    return urllib.request.urlopen(request)

二:爬取并匹配数据

def bo_spider(res):
    """Parse a job-list page and collect the text of selected fields.

    Every ``<ul>`` found under ``<div class="job-list">`` is scanned, and the
    text nodes matched by each field's XPath are gathered into one list per
    field. Results from all matching ``<ul>`` elements are accumulated; the
    previous version overwrote the dict on every iteration, so only the last
    ``<ul>`` survived when more than one matched.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        dict mapping each field label to a list of matched text strings.
        Empty dict when no ``<ul>`` is found under the job-list div.
    """
    html = res.read()
    tree = etree.HTML(html)

    # Field label -> XPath (relative to one <ul>) selecting its text nodes.
    # Labels: job type, pay, publish time, company name, location.
    field_paths = {
        '职位类型': './li//h3//div[@class="job-title"]/text()',
        '待遇': './li//h3//span[@class="red"]/text()',
        '发布时间': './li//div[@class="info-publis"]//p/text()',
        '公司名称': './li//div[@class="company-text"]//a/text()',
        '地点': './li//div[@class="info-primary"]//p/text()',
    }

    bo_dict = {}
    for bo in tree.xpath('//div[@class="job-list"]//ul'):
        for label, path in field_paths.items():
            # extend rather than assign, so earlier <ul> results are kept
            bo_dict.setdefault(label, []).extend(bo.xpath(path))
        print(bo_dict)
    return bo_dict

三:用json存储匹配的数据

def xiazai(bo_dict):
    """Serialize *bo_dict* to JSON and append it to ``boos.json``.

    The previous version passed the already-serialized string through
    ``json.dumps`` a second time, so the file contained a quoted, escaped
    JSON *string* instead of the object itself. The serialized text is now
    written as-is, followed by a newline so that repeated calls (the file is
    opened in append mode) produce one JSON document per line.

    Args:
        bo_dict: JSON-serializable mapping of scraped fields.

    Returns:
        The JSON text of *bo_dict* (``json.dumps(bo_dict)``).
    """
    bo_list = json.dumps(bo_dict)
    # The with-statement closes the file; no explicit close() is needed.
    with open("boos.json", 'a', encoding="utf-8") as fp:
        fp.write(bo_list + "\n")
    return bo_list

四:控制函数

def main():
    """Prompt for a job keyword, fetch the search page and scrape it.

    The keyword is percent-encoded before being placed in the query string;
    previously it was concatenated raw, so non-ASCII keywords or keywords
    containing spaces or ``&`` produced a broken request. Plain ASCII
    keywords yield the same URL as before.

    Returns:
        The dict returned by ``bo_spider`` for the fetched page.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # NOTE(review): the fixed "%E7%88%AC%E8%99%AB" suffix is always appended
    # to the keyword; it is kept as in the original - confirm it is intended.
    url = (
        "https://www.zhipin.com/job_detail/?query="
        + quote(work)
        + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    )
    # The xiazai() call is disabled in this version, so the scraped data is
    # only returned, not written to disk.
    text = bo_spider(bo_url(url))
    return text


if __name__ == '__main__':
    main()
五:整体代码

import json
import urllib.request
from lxml import etree
def bo_url(url):
    """Open *url* with a browser-like User-Agent header and return the response.

    Args:
        url: Address to request.

    Returns:
        The object returned by ``urllib.request.urlopen``. It is handed back
        still open; the caller reads (and should close) it.

    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    # Send a desktop Chrome User-Agent instead of urllib's default one.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"
    )
    request = urllib.request.Request(url=url, headers={"User-Agent": user_agent})
    return urllib.request.urlopen(request)

def bo_spider(res):
    """Parse a job-list page and collect the text of selected fields.

    Every ``<ul>`` found under ``<div class="job-list">`` is scanned, and the
    text nodes matched by each field's XPath are gathered into one list per
    field. Results from all matching ``<ul>`` elements are accumulated; the
    previous version overwrote the dict on every iteration, so only the last
    ``<ul>`` survived when more than one matched.

    Args:
        res: Response-like object whose ``read()`` returns the page HTML.

    Returns:
        dict mapping each field label to a list of matched text strings.
        Empty dict when no ``<ul>`` is found under the job-list div.
    """
    html = res.read()
    tree = etree.HTML(html)

    # Field label -> XPath (relative to one <ul>) selecting its text nodes.
    # Labels: job type, pay, publish time, company name, location.
    field_paths = {
        '职位类型': './li//h3//div[@class="job-title"]/text()',
        '待遇': './li//h3//span[@class="red"]/text()',
        '发布时间': './li//div[@class="info-publis"]//p/text()',
        '公司名称': './li//div[@class="company-text"]//a/text()',
        '地点': './li//div[@class="info-primary"]//p/text()',
    }

    bo_dict = {}
    for bo in tree.xpath('//div[@class="job-list"]//ul'):
        for label, path in field_paths.items():
            # extend rather than assign, so earlier <ul> results are kept
            bo_dict.setdefault(label, []).extend(bo.xpath(path))
        print(bo_dict)
    return bo_dict

def xiazai(bo_dict):
    """Serialize *bo_dict* to JSON and append it to ``boos.json``.

    The previous version passed the already-serialized string through
    ``json.dumps`` a second time, so the file contained a quoted, escaped
    JSON *string* instead of the object itself. The serialized text is now
    written as-is, followed by a newline so that repeated calls (the file is
    opened in append mode) produce one JSON document per line.

    Args:
        bo_dict: JSON-serializable mapping of scraped fields.

    Returns:
        The JSON text of *bo_dict* (``json.dumps(bo_dict)``).
    """
    bo_list = json.dumps(bo_dict)
    # The with-statement closes the file; no explicit close() is needed.
    with open("boos.json", 'a', encoding="utf-8") as fp:
        fp.write(bo_list + "\n")
    return bo_list

def main():
    """Prompt for a job keyword, scrape the search page and save the result.

    The keyword is percent-encoded before being placed in the query string;
    previously it was concatenated raw, so non-ASCII keywords or keywords
    containing spaces or ``&`` produced a broken request. Plain ASCII
    keywords yield the same URL as before.

    Returns:
        The value returned by ``xiazai`` for the scraped data.
    """
    from urllib.parse import quote

    work = input("请输入你要爬取的岗位名称")
    # NOTE(review): the fixed "%E7%88%AC%E8%99%AB" suffix is always appended
    # to the keyword; it is kept as in the original - confirm it is intended.
    url = (
        "https://www.zhipin.com/job_detail/?query="
        + quote(work)
        + "%E7%88%AC%E8%99%AB&scity=101280600&industry=&position="
    )
    # Pipeline: fetch the page -> extract the fields -> store them as JSON.
    text = xiazai(bo_spider(bo_url(url)))
    return text


if __name__ == '__main__':
    main()


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值