Python使用xpath爬取51job

最新推荐文章于 2024-09-12 15:54:27 发布

May_Xu_

最新推荐文章于 2024-09-12 15:54:27 发布

阅读量868

点赞数 1

文章标签： python xpath 多线程

本文链接：https://blog.csdn.net/May_Xu_/article/details/107005729

版权

为了加快爬取速度，这里使用两个线程并行爬取：一个线程负责奇数页，另一个线程负责偶数页。

import requests
from lxml import etree
import threading

def get_request(page, timeout=10):
    """Download one page of 51job search results for the keyword "Python".

    Args:
        page: Page number; it is inserted into the search URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The response body decoded to text.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    url = "https://search.51job.com/list/190200,000000,0000,00,9,99,Python,2,"+str(page)+".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    headers = {
        # NOTE(review): this looks like a placeholder ("header information");
        # replace it with a real browser User-Agent string before running.
        "User-Agent":"头部信息"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    res.raise_for_status()
    # Let requests guess the charset from the body rather than trusting the
    # declared one.
    res.encoding = res.apparent_encoding
    return res.text

def get_content(respon):
    """Parse a search-result page and print every job posting it contains.

    Postings are the ``div.el`` elements found under ``div.dw_table``. For
    each one the link, title, company, location, salary and publication
    time are printed. A missing salary is shown as "null"; a posting that
    lacks any other field is reported as an error and skipped.

    Args:
        respon: HTML text of a search-result page.
    """
    rows = etree.HTML(respon).xpath('//div[@class="dw_table"]//div[@class="el"]')
    for row in rows:
        try:
            # Each lookup takes the first matching node; an empty result
            # raises IndexError, which marks the row as unusable.
            position = row.xpath('./p/span/a/text()')[0].strip()
            link = row.xpath('./p/span/a/@href')[0]
            company = row.xpath('./span/a/text()')[0]
            address = row.xpath('./span[@class="t3"]/text()')[0]
            update_time = row.xpath('./span[@class="t5"]/text()')[0]
        except IndexError as e:
            print("发生错误\n{}".format(e))
            continue

        # Salary is optional on the page, so fall back to a marker value.
        wage_nodes = row.xpath('./span[@class="t4"]/text()')
        wage = wage_nodes[0] if wage_nodes else "null"

        fields = (
            ("职位链接：", link),
            ("职位名：", position),
            ("公司名：", company),
            ("工作地点：", address),
            ("工资：", wage),
            ("发布时间：", update_time),
        )
        lines = ["{} {}".format(label, value) for label, value in fields]
        lines.append("-"*30)
        # Emit the whole record with one print call so that the fields of a
        # posting stay together when several threads print concurrently.
        print("\n".join(lines))

def main(page):
    """Fetch and print a single result page, reporting success or failure.

    Any exception raised while downloading or parsing is caught and printed,
    so a failing page never propagates an error to the caller.

    Args:
        page: Page number to crawl.
    """
    # No trailing newline: the status message continues on the same line
    # unless something else is printed in between.
    print("第",page,"页",end="")
    try:
        page_html = get_request(page)
        get_content(page_html)
        print("爬取成功")
    except Exception as err:
        print("爬取失败")
        print(err)

def run1():
    """Crawl the odd-numbered pages 1, 3, 5, 7, 9 and 11 in order."""
    for index in range(6):
        main(2 * index + 1)
def run2():
    """Crawl the even-numbered pages 2, 4, 6, 8 and 10 in order."""
    for index in range(1, 6):
        main(2 * index)

if __name__ == '__main__':
    # One worker thread per page set (odd pages, even pages). Start both
    # before joining either so the two crawls overlap.
    workers = [threading.Thread(target=job, args=()) for job in (run1, run2)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()