day23 - Crawler 3: PyQuery and XPath


Summary

  • # PyQuery
    
    from pyquery import PyQuery
    
    # 1. Create a PyQuery object
    # A PyQuery object is essentially a container of nodes;
    # here pq's container holds just the single html root tag.
    # pq = PyQuery('html string to parse')
    
    with open(r'./temp.html', 'r', encoding='utf-8') as f:
        html = f.read()
    
    pq = PyQuery(html)
    print(type(pq))  # <class 'pyquery.pyquery.PyQuery'>
    
    # 2. Get child tags with a CSS selector
    # PyQuery object(selector)
    p_pq = pq('p')
    print(type(p_pq))  # <class 'pyquery.pyquery.PyQuery'>
    print(p_pq)
    
    a_pq = pq('a')
    print(type(a_pq))  # <class 'pyquery.pyquery.PyQuery'>
    # print(a_pq)
    
    div_pq = pq('div')
    # print(div_pq)
    print(type(div_pq[0]))  # <class 'lxml.html.HtmlElement'>
    
    li = pq('#listAll > div.layout-Module-container.layout-Cover.ListContent > ul > li')
    title = li('div > a > div.DyListCover-content > div:nth-child(1) > h3')
    
    # 3. Get the text content of a tag
    # PyQuery object.text()
    print(title.text())
    
    # 4. Get a tag attribute
    # PyQuery object.attr(attribute name)
    
    result = li.attr('class')
    print(result)       # layout-Cover-item  -  attr() only returns the attribute of the first li tag
    print('='*40)
    
    # Get the class attribute of every li tag
    for l in li:
        print(PyQuery(l).attr('class'))
    
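    # A small extra sketch (not in the original notes): PyQuery also provides .items(),
    # which yields each match already wrapped as a PyQuery object,
    # so the manual PyQuery(l) wrapping above can be avoided.
    for item in li.items():
        print(item.attr('class'))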
    
  • # lxml operations
    
    from lxml import etree
    
    # 1. Build the element tree and get its root node
    html = etree.HTML(open('temp.html', encoding='utf-8').read())
    
    # 2. Get nodes under the root node by path
    res = html.xpath('/html/body')
    print(res)      # [<Element body at 0x216b0079500>]
    
    # 3. Paths
    # Absolute path: /absolute path
    # Note: an absolute path must start from the root of the tree (for an html tree, that means starting from html)
    res = html.xpath('/html/head')
    print(res)      # [<Element head at 0x17b37a98780>]
    
    
    # Relative path: ./relative path
    # Note: a relative path starts from the current node; .   -   the current node, ..   -   the parent node
    meta = res[0].xpath('./meta')
    print(meta)
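    # A small extra sketch (not in the original notes): ".." walks back up to the parent node.
    if meta:
        parent = meta[0].xpath('..')
        print(parent)       # [<Element head at 0x...>]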
    
    # 4. Get text values with /text()
    # For example:
    # span_texts = html.xpath('//span/text()')
    
    # 5. Search from any position under a given path with //
    span = html.xpath('//span')
    print(span)
    for s in span:
        print(s.xpath('./text()'))
    
    # 6. Get attributes
    # Get an attribute   -   @attribute name
    for s in span:
        print(s.xpath('./@class'))
    
    ids = html.xpath('//@id')
    print(ids)
    
    
    # 7. Predicates   -   filtering conditions
    # 1) [N]    -   the Nth one
    result = html.xpath('/html/body//a[1]/@href')
    print(result)
    
    # 2) [last()]   -   the last one
    #    [last() - N]   -   the (N+1)th from the end
    
    # 3) [position()<3]     -   the first two, i.e. [1, 3)
    #    [position()<N]     -   the first N-1, i.e. [1, N)
    #    [position()>N]     -   everything after the Nth one, i.e. (N, ∞)
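    
    # A hedged sketch of these predicates (assumes temp.html, like the Douyu page above,
    # contains several a tags; adjust the paths for your own page):
    last_href = html.xpath('/html/body//a[last()]/@href')          # href of the last a tag
    print(last_href)
    first_two = html.xpath('/html/body//a[position()<3]/@href')    # hrefs of the first two a tags
    print(first_two)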
    
    # 8. [@attribute name]   -   has the given attribute
    result = html.xpath('//a[@class]')
    print(result)
    
    # 9. [@attribute name=value]   -   filter on an attribute value; the comparison can be =, >=, <=, > or <
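    # A hedged example (the class value 'DyListCover-content' is taken from the Douyu
    # selectors above and may not exist in your temp.html; substitute a real value):
    result = html.xpath('//div[@class="DyListCover-content"]')
    print(result)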
    
    
    # 10. *   -   wildcard; matches any tag or any attribute
    res = html.xpath('./body/*')
    print(res)
    
    attrs = html.xpath('//a/@*')
    print(attrs)
    
    # 11. |     -       or
    res = html.xpath('./body/*|./body/div/span')
    res = html.xpath('./body/*/div/text()|./body/div/a/text()')
    
    
    

Homework

# Crawling Douyu

import requests
from bs4 import BeautifulSoup
import json
import csv
import selenium
from selenium.webdriver import Chrome


def get_net_data(_url, _headers={}, _params={}):
    # Fetch a page and return its text; on a non-200 status print the response and return None.
    res = requests.get(_url, headers=_headers, params=_params)
    try:
        if res.status_code == 200:
            return res.text
        print(res)
    finally:
        res.close()


def get_json_data(_url, _headers={}, _params={}):
    # Fetch a page and return the parsed JSON; on a non-200 status print the response and return None.
    res = requests.get(_url, headers=_headers, params=_params)
    try:
        if res.status_code == 200:
            return res.json()
        print(res)
    finally:
        res.close()


def analysis_data(html: str):
    soup = BeautifulSoup(html, 'lxml')
    print(type(soup))
    lis = soup.select('#listAll > div.layout-Module-container.layout-Cover.ListContent > ul > li')
    for li in lis:
        print(type(li))
        title = li.select_one('div > a > div.DyListCover-content > div:nth-child(1) > h3')
        print('title', title.get_text())
        anchor = li.select_one('div > a > div.DyListCover-content > div:nth-child(2) > h2 > div')
        print('anchor', anchor.get_text())
        hot = li.select_one('div > a > div.DyListCover-content > div:nth-child(2) > span > svg')
        print('hot', hot)
        tag = li.select_one('div > a > div.DyListCover-content > span')
        print('tag', tag)


if __name__ == '__main__':
    # url = 'https://www.douyu.com/g_LOL'
    # all_list_data = []
    # next_url = 'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/1'
    # headers = {
    #     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    # }
    # for x in range(1, 11):
    #     next_url = f'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/{x}'
    #     py_obj = get_json_data(next_url, headers)
    #     # print(py_obj['data']['rl'])
    #     all_list_data.extend(py_obj['data']['rl'])
    #
    # with open(r'./lol.json', 'w', encoding='utf-8') as f:
    #     f.write(json.dumps(all_list_data))
    # analysis_data(html)

    with open(r'./lol.json', 'r', encoding='utf-8') as f:
        py_obj = json.loads(f.read())

    origin = 'https://www.douyu.com'
    new_list = []
    for obj in py_obj:
        one_data = [obj['rn'], obj['nn'], obj['ol'], f"{origin}{obj['url']}", obj['rs1'], obj['c2name']]
        new_list.append(one_data)
    with open(r'./lol.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['anchor', 'tag', 'hot', 'url', 'img', 'game'])
        writer.writerows(new_list)

# Crawling Douban

import requests
from lxml import etree
import re
import csv


# 'https://movie.douban.com/top250?start=0&filter='

class CrawpDoubanTop250():
    def __init__(self, headers={}, params={}):
        self.url = 'https://movie.douban.com/top250?start=%s&filter='
        self.headers = headers
        self.params = params

    def get_urls(self, page):
        return self.url % page

    def parse_url(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            return None

    def get_content(self, page):
        url = self.get_urls(page)
        content = self.parse_url(url)
        return content

    def analysis_data(self, content):
        html = etree.HTML(content)
        lis = html.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
        all_info = []
        # Looking at the page, some movies are missing some fields; xpath returns nothing for a
        # missing field rather than an empty placeholder, so the columns of the flat list would no
        # longer line up. This single combined query is therefore abandoned:
        # res = li.xpath('./div/div[2]/div[1]/a/span[1]/text()|./div/div[2]/div[1]/a/span[2]/text()|./div/div[2]/div[1]/a/span[3]/text()|./div/div[1]/a/img/@src|./div/div[2]/div[1]/a/@href|./div/div[2]/div[2]/p[1]/text()|./div/div[2]/div[2]/div/*/text()|./div/div[2]/div[2]/p[2]/span/text()')
        for li in lis:
            list_res = []
            # res = li.xpath('./div/div[2]/div[1]/a/span[1]/text()|./div/div[2]/div[1]/a/span[2]/text()')
            res = li.xpath('./div/div[2]/div[1]/a/span[position()<3]/text()')
            list_res.extend(res)
            # Some movies on the page do not have this field. If it is missing, xpath returns nothing,
            # and when the rows are written to csv as lists the columns would no longer line up,
            # so query it separately and pad the list with an empty string when it is absent.
            res2 = li.xpath('./div/div[2]/div[1]/a/span[3]/text()')
            list_res.extend(res2 if res2 else [''])
            res3 = li.xpath('./div/div[1]/a/img/@src|./div/div[2]/div[1]/a/@href|./div/div[2]/div[2]/p[1]/text()|./div/div[2]/div[2]/div/*/text()')
            list_res.extend(res3)
            # Same situation as res2
            res4 = li.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            list_res.extend(res4 if res4 else [''])
            new_res = []
            for item in list_res:
                item = re.sub(r'(^\s*/?)?\s+', ' ', item).strip()
                new_res.append(item)
            all_info.append(new_res)
        return all_info


if __name__ == '__main__':
    all_data = []
    top25 = CrawpDoubanTop250()
    for x in range(0, 226, 25):
        data = top25.get_content(x)
        all_data.extend(top25.analysis_data(data))

    with open(r'./top25.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'other_title', 'other', 'img', 'url', 'director', 'movie_info', 'grade', 'number', 'subhead'])
        writer.writerows(all_data)