Web Scraping - Parsing Data
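
The script below fetches the search-results page for "qq" from ddooo.com and extracts the software links in two ways: with XPath via lxml, and with find_all via BeautifulSoup.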

import requests
# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup
from lxml import etree
class ddooo:
    def __init__(self):
        res = self.爬取源代码()      # fetch the page source
        # self.用xpath解析数据(res)  # XPath variant (disabled)
        self.用bs解析数据(res)       # parse with BeautifulSoup
    def 用xpath解析数据(self, res):
        html = etree.HTML(res)  # initialize the lxml parser
        li = []
        # the target list holds 20 entries, li[1] through li[20]
        for i in range(1, 21):
            # p = f'//*[@id="dnrj"]/ul/li[{i}]/div[1]/p/a/text()'  # link text variant
            p = f'//*[@id="dnrj"]/ul/li[{i}]/div[1]/p/a/@href'  # link href
            li.append(html.xpath(p))
        return li
    def 用bs解析数据(self, res):
        html = BeautifulSoup(res, 'lxml')  # initialize the soup
        x = html.find_all(attrs={"class": "pic"})  # every tag with class="pic"
        li = []
        for i in x:
            li.append(i.text)
            # print(i.text)         # link text
            # print(i.get('href'))  # link target
            # print(type(i))        # bs4.element.Tag
            
        '''
        find_all usage reminder:
        initialized_soup.find_all(name='a', attrs={"class": "pic"})
        filters by tag name and attribute values at the same time.
        '''
        return li
    def 爬取源代码(self):
        # request headers captured from a real browser session
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "sec-ch-ua": "\"Google Chrome\";v=\"123\", \"Not:A-Brand\";v=\"8\", \"Chromium\";v=\"123\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        cookies = {
            "UM_distinctid": "18eb34b98a52f9-064823b28d2454-26001a51-1fa400-18eb34b98a6989",
            "CNZZDATA1281127966": "1317709386-1712405060-%7C1713836543"
        }
        url = "https://search.ddooo.com/search.html"
        params = {
            "wd": "qq"
        }
        # fetch the search results and return the raw HTML
        response = requests.get(url, headers=headers, cookies=cookies, params=params)
        return response.text
if __name__ == '__main__':
    f = ddooo()  # __init__ already fetches and parses once
    res = f.爬取源代码()
    xpath = f.用xpath解析数据(res)
    bs = f.用bs解析数据(res)
    print(xpath)
    print()
    print(bs)
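
For comparison, BeautifulSoup also supports CSS selectors through select(). A minimal sketch under the assumption that the page keeps the #dnrj list structure targeted above (parse_with_css is a hypothetical name, not part of the original script):

from bs4 import BeautifulSoup

def parse_with_css(res):
    soup = BeautifulSoup(res, 'lxml')
    links = []
    # select() matches every <a> under the #dnrj list in one call,
    # so the hard-coded li[1]..li[20] loop is unnecessary
    for a in soup.select('#dnrj ul li div p a'):
        links.append((a.get_text(strip=True), a.get('href')))
    return links

The same collapse works on the XPath side: html.xpath('//*[@id="dnrj"]/ul/li/div[1]/p/a/@href') returns every matching href as one flat list in a single call.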
