Reverse-Engineering Crawlers 06: bs4, xpath, and pyquery in Practice

Originally I wanted to compare the usage of these three modules in detail, but while working through the examples I found that as long as you follow one routine, scraping static pages (pages whose data sits directly in the HTML source) is fairly straightforward. The small usage quirks each person runs into will differ, and the only way to really master these three tools is to practice with them yourself.
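
To make that routine concrete, here is a minimal sketch (it runs on a made-up HTML fragment, not on any of the sites below) that pulls the same field out with all three parsers; only the selector syntax changes:

from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery

html = '<div class="card"><h4 title="demo task">demo task</h4><span class="price">¥100</span></div>'

# bs4: locate elements by tag name and attributes
soup = BeautifulSoup(html, "html.parser")
print(soup.find("span", attrs={"class": "price"}).text)                    # ¥100

# xpath: locate elements with path expressions
et = etree.HTML(html)
print(et.xpath('//div[@class="card"]/span[@class="price"]/text()')[0])     # ¥100

# pyquery: locate elements with CSS selectors
doc = PyQuery(html)
print(doc("div.card span.price").text())                                   # ¥100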

The routine is the one shown in the two videos 《xpath实战案例_猪八戒》 and 《pyquery实战案例》 from the data-parsing chapter (Chapter 2) of the 路飞学城 reverse-engineering crawler course. I won't spell it out again here; practice it yourself, because just watching without practicing gets you nowhere.

The code follows, for learning purposes only. Note that it is time-sensitive: once the sites' HTML structure changes, it will stop working.

bs4: scraping outsourced task listings from the 猪八戒 bidding hall (task.zbj.com)
"""
    Goal: scrape the price, task title, and task detail from the 猪八戒 bidding hall
    url: https://task.zbj.com/page1.html
"""
import requests
from bs4 import BeautifulSoup
import time

def get_html_source(url):
    resp = requests.get(url)
    resp.encoding = "utf-8"
    # with open("source.html", mode="w", encoding="utf-8") as f:
    #     f.write(resp.text)
    return resp.text

def get_data_from_html(html):
    page = BeautifulSoup(html, "html.parser")
    div_list = page.find_all("div", attrs={"class": "result-search-item"})
    # with open("result-search-item.html", mode="w", encoding="utf-8") as f:
    #     f.write(str(div_list))
    for div in div_list:
        h4 = div.find("h4")
        work_title = h4.get("title")
        div_detail = div.find("div", attrs={"class": "pub-desc text-line-overflow-two"})
        work_detail = div_detail.text
        span_price = div.find("span", attrs={"class": "price"})
        price = span_price.text
        work_detail = work_detail.replace("\n", "")
        # print(f"{price},{work_title},{work_detail}")
        with open("result.csv", mode="a", encoding="utf-8") as f:
            f.write(f"{price},{work_title},{work_detail}\n")

if __name__ == "__main__":
    for i in range(3400):	# set this according to the total page count shown at the bottom of the bidding hall
        url = f"https://task.zbj.com/page{i+1}.html"
        html = get_html_source(url)
        get_data_from_html(html)
        time.sleep(3)
    print("猪八戒招标大厅信息爬取完成!")
xpath: scraping developer profiles from 程序员客栈 (proginn.com)
"""
    Goal: scrape developer profiles from 程序员客栈
    url: https://www.proginn.com/cat/page/1/
"""

import requests
from lxml import etree
import time

def get_html_source(url):
    headers = {
        # Add a User-Agent header. Without one, the target server refuses the request; this is the simplest anti-scraping check, and sending a browser UA string in the HTTP headers is enough to get past it.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    # with open("source.html", mode="w", encoding="utf-8") as f:
    #     f.write(resp.text)
    return resp.text

def get_data_from_html(html):
    et = etree.HTML(html)
    user_info = '/html/body/div[@class="main"]/div[@class="main_body"]/div/div[@class="ui divided items proginn-user-list"]/div[@class="item J_user"]/div[@class="user-info fl"]'
    name_list = et.xpath(user_info + '/div[@class="title"]/a/span/text()')
    skill_list = et.xpath(f"{user_info}/div[2]/p[2]/span/text()|{user_info}/div[2]/p[2]/span[not(text())]")
    workspace_list = et.xpath(f"{user_info}/div[2]/div/div[1]/span[2]/text()|{user_info}/div[2]/div/div[1]/span[not(text())]")
    worktime_list = et.xpath(user_info + '/div[2]/div/div[2]/span[2]/text()')
    salary_list = et.xpath('/html/body/div[@class="main"]/div[@class="main_body"]/div/div[@class="ui divided items proginn-user-list"]/div[@class="item J_user"]/div[@class="hire-info fl"]/p[1]/span/text()')
    href = et.xpath(user_info + '/div[@class="title"]/a/@href')
    for idx in range(15):
        detail_html = get_html_source(href[idx])
        detail_et = etree.HTML(detail_html)
        detail = "无"
        try:
            detail = detail_et.xpath('/html/head/meta[@name="description"]/@content')[0]
            detail = detail.replace("\n", "")
            detail = detail.replace("- ", "")
            detail = detail.replace(" ", "")
        except Exception:
            # no description meta tag on this profile; keep the placeholder so the write below still works
            print("没有详情!")
        # the "|span[not(text())]" branch of the xpath returns Element objects for empty
        # fields, so a type mismatch against the text results means the field is absent
        if type(skill_list[idx]) != type(salary_list[idx]):
            skill_list[idx] = "无"
        else:
            skill_list[idx] = skill_list[idx].replace(","," ")
        if type(workspace_list[idx]) != type(salary_list[idx]):
            workspace_list[idx] = "无"
        
        print(f"{salary_list[idx]},{workspace_list[idx]},{worktime_list[idx]},{name_list[idx]},{skill_list[idx]}")
        with open("程序员客栈程序员信息.csv", mode="a", encoding="utf-8") as f:
            f.write(f"{salary_list[idx]},{workspace_list[idx]},{worktime_list[idx]},{name_list[idx]},{skill_list[idx]},{detail}\n")
    
if __name__ == "__main__":
    for i in range(1, 101):		# set this according to the total page count shown at the bottom of 程序员客栈
        url = f"https://www.proginn.com/cat/page/{i}/"
        html = get_html_source(url)
        get_data_from_html(html)
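
The script above collects several parallel lists and then walks them with a fixed range(15); if any profile card is missing a field, the lists fall out of step, which is what the type-comparison workaround papers over. A sturdier layout, sketched here with the same class names but not verified against the live site, selects one node per user and runs relative XPath inside it:

def get_data_from_html(html):
    et = etree.HTML(html)
    # one node per user card, so a missing field only affects that single card
    users = et.xpath('//div[@class="item J_user"]')
    for user in users:
        name = user.xpath('.//div[@class="title"]/a/span/text()')
        skills = user.xpath('.//div[@class="user-info fl"]/div[2]/p[2]/span/text()')
        salary = user.xpath('.//div[@class="hire-info fl"]/p[1]/span/text()')
        print("".join(name) or "无", " ".join(skills) or "无", "".join(salary) or "无")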
        
pyquery: scraping crawler job postings from 猎聘 (liepin.com)
"""
    Goal: scrape crawler job postings from the 猎聘 site
    url: https://www.liepin.com/zhaopin/?headId=1bd035b6a73e295eaafa5aedf960fe32&ckId=23fhmys0ecze35t8oork8bqoa4zydf9a&oldCkId=1bd035b6a73e295eaafa5aedf960fe32&fkId=tonyue22m6ifnzptvbka94m9o3x1nyha&skId=tonyue22m6ifnzptvbka94m9o3x1nyha&sfrom=search_job_pc&key=%E7%88%AC%E8%99%AB&currentPage=0&scene=page
"""
from pyquery import PyQuery
import requests
import time

def get_html_source(url):
    headers = {
        # Add a User-Agent header. Without one, the target server refuses the request; this is the simplest anti-scraping check, and sending a browser UA string in the HTTP headers is enough to get past it.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = "utf-8"
    # with open("source.html", mode="w", encoding="utf-8") as f:
    #     f.write(resp.text)
    return resp.text

def get_data_from_html(html):
    # load the html content into pyquery
    doc = PyQuery(html)
    doc = doc(".left-list-box > ul:nth-child(1) li").items()
    for item in doc:
        work_href = item("div.job-detail-box > a:nth-child(1)").attr("href")
        job_name = item("div.job-title-box > div:nth-child(1)").text()
        area = item("div.job-title-box > div:nth-child(2) > span:nth-child(2)").text()
        salary = item("span.job-salary").text()
        gener_skill = item("span.labels-tag").items()
        request = []
        for skill in gener_skill:
            request.append(skill.text())
        request = "|".join(request)
        company_href = item("div.job-detail-box > a:nth-child(2)").attr("href")
        company_name = item("span.company-name").text()
        area_people = []
        spans = item("div.company-tags-box > span").items()
        for span in spans:
            area_people.append(span.text())
        area_people = " ".join(area_people)
        result = f"{salary},{area},{job_name},{request},{company_name},{area_people},{work_href},{company_href}"
        with open("猎聘爬虫岗位信息.csv", mode="a", encoding="utf-8") as f:
            f.write(f"{result}\n")
        print(result)

if __name__ == "__main__":
    print("pyquery execise")
    for i in range(10):		# 猎聘 only exposes 10 pages of results
        url = f"https://www.liepin.com/zhaopin/?headId=1bd035b6a73e295eaafa5aedf960fe32&ckId=23fhmys0ecze35t8oork8bqoa4zydf9a&oldCkId=1bd035b6a73e295eaafa5aedf960fe32&fkId=tonyue22m6ifnzptvbka94m9o3x1nyha&skId=tonyue22m6ifnzptvbka94m9o3x1nyha&sfrom=search_job_pc&key=%E7%88%AC%E8%99%AB&currentPage={i}&scene=page"
        html = get_html_source(url)
        get_data_from_html(html)
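
All three scripts build each CSV row by joining fields with commas in an f-string; as soon as a title or skill list itself contains a comma (which is common), the columns shift. A sketch of an alternative using the standard csv module, which quotes such fields automatically (the append_row helper is just illustrative):

import csv

def append_row(path, row):
    # csv.writer quotes any field containing a comma, so the column layout survives
    with open(path, mode="a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerow(row)

# inside the loop above, this would replace the manual f-string write, e.g.:
# append_row("猎聘爬虫岗位信息.csv",
#            [salary, area, job_name, request, company_name, area_people, work_href, company_href])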
