Reverse-Engineering Crawlers 06: bs4, xpath, and pyquery in Practice

Originally I wanted to compare the usage of these three modules in detail, but while working through the examples I found that as long as you follow one routine, scraping static pages (pages whose data sits directly in the HTML source) is fairly straightforward. The small usage quirks each person runs into will differ, and the only way to really master these three tools is to practice with them yourself.
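
To make that routine concrete, here is a minimal sketch (it runs on a made-up HTML fragment, not on any of the sites below) that pulls the same field out with all three parsers; only the selector syntax changes:

from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery

html = '<div class="card"><h4 title="demo task">demo task</h4><span class="price">¥100</span></div>'

# bs4: locate elements by tag name and attributes
soup = BeautifulSoup(html, "html.parser")
print(soup.find("span", attrs={"class": "price"}).text)                    # ¥100

# xpath: locate elements with path expressions
et = etree.HTML(html)
print(et.xpath('//div[@class="card"]/span[@class="price"]/text()')[0])     # ¥100

# pyquery: locate elements with CSS selectors
doc = PyQuery(html)
print(doc("div.card span.price").text())                                   # ¥100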

The routine is the one shown in the two videos 《xpath实战案例_猪八戒》 and 《pyquery实战案例》 from the data-parsing chapter (Chapter 2) of the 路飞学城 reverse-engineering crawler course. I won't spell it out again here; practice it yourself, because just watching without practicing gets you nowhere.

The code follows, for learning purposes only. Note that it is time-sensitive: once the sites' HTML structure changes, it will stop working.

bs4: scraping outsourced task listings from the 猪八戒 bidding hall (task.zbj.com)
"""
    Goal: scrape the price, task title, and task detail from the 猪八戒 bidding hall
    url: https://task.zbj.com/page1.html
"""
import requests
from bs4 import BeautifulSoup
import time

def get_html_source(url):
    resp = requests.get(url)
    resp.encoding = "utf-8"
    # with open("source.html", mode="w", encoding="utf-8") as f:
    #     f.write(resp.text)
    return resp.text

def get_data_from_html(html):
    page = BeautifulSoup(html, "html.parser")
    div_list = page.find_all("div", attrs={"class": "result-search-item"})
    # with open("result-search-item.html", mode="w", encoding="utf-8") as f:
    #     f.write(str(div_list))
    for div in div_list:
        h4 = div.find("h4")
        work_title = h4.get("title")
        div_detail = div.find("div", attrs={"class": "pub-desc text-line-overflow-two"})
        work_detail = div_detail.text
        span_price = div.find("span", attrs={"class": "price"})
        price = span_price.text
        work_detail = work_detail.replace("\n", "")
        # print(f"{price},{work_title},{work_detail}")
        with open("result.csv", mode="a", encoding="utf-8") as f:
            f.write(f"{price},{work_title},{work_detail}\n")

if __name__ == "__main__":
    for i in range(3400):	# set this according to the total page count shown at the bottom of the bidding hall
        url = f"https://task.zbj.com/page{i+1}.html"
        html = get_html_source(url)
        get_data_from_html(html)
        time.sleep(3)
    print("猪八戒招标大厅信息爬取完成!")
xpath: scraping developer profiles from 程序员客栈 (proginn.com)
"""
    Goal: scrape developer profiles from 程序员客栈
    url: https://www.proginn.com/cat/page/1/
"""

import requests
from lxml import etree
import time

def get_html_source(url):
    headers = {
        # Add a User-Agent header. Without one, the target server refuses the request; this is the simplest anti-scraping check, and sending a browser UA string in the HTTP headers is enough to get past it.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    # with open("source.html", mode="w", encoding="utf-8") as f:
    #     f.write(resp.text)
    return resp.text

def get_data_from_html(html):
    et = etree.HTML(html)
    user_info = '/html/body/div[@class="main"]/div[@class="main_body"]/div/div[@class="ui divided items proginn-user-list"]/div[@class="item J_user"]/div[@class="user-info fl"]'
    name_list = et.xpath(user_info + '/div[@class="title"]/a/span/text()')
    skill_list = et.xpath(f"{user_info}/div[2]/p[2]/span/text()|{user_info}/div[2]/p[2]/span[not(text())]")
    workspace_list = et.xpath(f"{user_info}/div[2]/div/div[1]/span[2]/text()|{user_info}/div[2]/div/div[1]/span[not(text())]")
    worktime_list = et.xpath(user_info + '/div[2]/div/div[2]/span[2]/text()')
    salary_list = et.xpath('/html/body/div[@class="main"]/div[@class="main_body"]/div/div[@class="ui divided items proginn-user-list"]/div[@class="item J_user"]/div[@class="hire-info fl"]/p[1]/span/text()')
    href = et.xpath(user_info + '/div[@class="title"]/a/@href')
    for idx in range(15):
        detail_html = get_html_source(href[idx])
        detail_et = etree.HTML(detail_html)
        detail = "无"
        try:
            detail = detail_et.xpath('/html/head/meta[@name="description"]/@content')[0]
            detail = detail.replace("\n", "")
            detail = detail.replace("- ", "")
            detail = detail.replace(" ", "")
        except Exception:
            # no description meta tag on this profile; keep the placeholder so the write below still works
            print("没有详情!")
        # the "|span[not(text())]" branch of the xpath returns Element objects for empty
        # fields, so a type mismatch against the text results means the field is absent
        if type(skill_list[idx]) != type(salary_list[idx]):
            skill_list[idx] = "无"
        else:
            skill_list[idx] = skill_list[idx].replace(","," ")
        if type(workspace_list[idx]) != type(salary_list[idx]):
            workspace_list[idx] = "无"
        
        print(f"{salary_list[idx]},{workspace_list[idx]},{worktime_list[idx]},{name_list[idx]},{skill_list[idx]}")
        with open("程序员客栈程序员信息.csv", mode="a", encoding="utf-8") as f:
            f.write(f"{salary_list[idx]},{workspace_list[idx]},{worktime_list[idx]},{name_list[idx]},{skill_list[idx]},{detail}\n")
    
if __name__ == "__main__":
    for i in range(1, 101):		# set this according to the total page count shown at the bottom of 程序员客栈
        url = f"https://www.proginn.com/cat/page/{i}/"
        html = get_html_source(url)
        get_data_from_html(html)
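
The script above collects several parallel lists and then walks them with a fixed range(15); if any profile card is missing a field, the lists fall out of step, which is what the type-comparison workaround papers over. A sturdier layout, sketched here with the same class names but not verified against the live site, selects one node per user and runs relative XPath inside it:

def get_data_from_html(html):
    et = etree.HTML(html)
    # one node per user card, so a missing field only affects that single card
    users = et.xpath('//div[@class="item J_user"]')
    for user in users:
        name = user.xpath('.//div[@class="title"]/a/span/text()')
        skills = user.xpath('.//div[@class="user-info fl"]/div[2]/p[2]/span/text()')
        salary = user.xpath('.//div[@class="hire-info fl"]/p[1]/span/text()')
        print("".join(name) or "无", " ".join(skills) or "无", "".join(salary) or "无")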
        
pyquery: scraping crawler job postings from 猎聘 (liepin.com)
"""
    Goal: scrape crawler job postings from the 猎聘 site
    url: https://www.liepin.com/zhaopin/?headId=1bd035b6a73e295eaafa5aedf960fe32&ckId=23fhmys0ecze35t8oork8bqoa4zydf9a&oldCkId=1bd035b6a73e295eaafa5aedf960fe32&fkId=tonyue22m6ifnzptvbka94m9o3x1nyha&skId=tonyue22m6ifnzptvbka94m9o3x1nyha&sfrom=search_job_pc&key=%E7%88%AC%E8%99%AB&currentPage=0&scene=page
"""
from pyquery import PyQuery
import requests
import time

def get_html_source(url):
    headers = {
        # Add a User-Agent header. Without one, the target server refuses the request; this is the simplest anti-scraping check, and sending a browser UA string in the HTTP headers is enough to get past it.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    # with open("source.html", mode="w", encoding="utf-8") as f:
    #     f.write(resp.text)
    return resp.text

def get_data_from_html(html):
    # load the html content into pyquery
    doc = PyQuery(html)
    doc = doc(".left-list-box > ul:nth-child(1) li").items()
    for item in doc:
        work_href = item("div.job-detail-box > a:nth-child(1)").attr("href")
        job_name = item("div.job-title-box > div:nth-child(1)").text()
        area = item("div.job-title-box > div:nth-child(2) > span:nth-child(2)").text()
        salary = item("span.job-salary").text()
        gener_skill = item("span.labels-tag").items()
        request = []
        for skill in gener_skill:
            request.append(skill.text())
        request = "|".join(request)
        company_href = item("div.job-detail-box > a:nth-child(2)").attr("href")
        company_name = item("span.company-name").text()
        area_people = []
        spans = item("div.company-tags-box > span").items()
        for span in spans:
            area_people.append(span.text())
        area_people = " ".join(area_people)
        result = f"{salary},{area},{job_name},{request},{company_name},{area_people},{work_href},{company_href}"
        with open("猎聘爬虫岗位信息.csv", mode="a", encoding="utf-8") as f:
            f.write(f"{result}\n")
        print(result)

if __name__ == "__main__":
    print("pyquery execise")
    for i in range(10):		# 猎聘 only exposes 10 pages of results
        url = f"https://www.liepin.com/zhaopin/?headId=1bd035b6a73e295eaafa5aedf960fe32&ckId=23fhmys0ecze35t8oork8bqoa4zydf9a&oldCkId=1bd035b6a73e295eaafa5aedf960fe32&fkId=tonyue22m6ifnzptvbka94m9o3x1nyha&skId=tonyue22m6ifnzptvbka94m9o3x1nyha&sfrom=search_job_pc&key=%E7%88%AC%E8%99%AB&currentPage={i}&scene=page"
        html = get_html_source(url)
        get_data_from_html(html)
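
All three scripts build each CSV row by joining fields with commas in an f-string; as soon as a title or skill list itself contains a comma (which is common), the columns shift. A sketch of an alternative using the standard csv module, which quotes such fields automatically (the append_row helper is just illustrative):

import csv

def append_row(path, row):
    # csv.writer quotes any field containing a comma, so the column layout survives
    with open(path, mode="a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerow(row)

# inside the loop above, this would replace the manual f-string write, e.g.:
# append_row("猎聘爬虫岗位信息.csv",
#            [salary, area, job_name, request, company_name, area_people, work_href, company_href])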
