https://ssr1.scrape.center/ — a simple practice site, scraped two ways: with requests and with selenium

ssr1 (a movie-data site with no anti-scraping measures)

Summary (requests implementation):

'''
1. /text() gets the text directly under a tag; //text() gets the text under a tag,
including the text inside all of its descendant tags.
This matters when the number of tags is not fixed:
each movie has a different number of category tags, each sitting in its own child tag
of the HTML document. Select the enclosing tag and read all of the text it contains,
descendants included (see the sketch after this note).

2. Stripping spaces and newlines out of a list:
data_list = [x.strip() for x in temp_list if x.strip() != ''], where temp_list is the source list
e.g.
['\n        ', '    剧情   ', '\n        ', '\n爱情        ', '\n        ', '\n        ']
['剧情', '爱情']

3. Guard against empty results (indexing [0] on an empty result raises an IndexError,
because the XPath query found nothing):
Wrong:
data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0]
Recommended (a helper that avoids the duplicated query is sketched after this note):
data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0] if len(node.xpath('./div[3]/p[1]/text()')) > 0 else None
'''
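
A minimal sketch of point 1, using a made-up HTML fragment (the markup below is hypothetical and only illustrates the /text() vs //text() difference):

from lxml import etree

# hypothetical fragment: each category nested inside its own child tags
html = etree.HTML('<div class="categories"><button><span>剧情</span></button>'
                  '<button><span>爱情</span></button></div>')
node = html.xpath('//div[@class="categories"]')[0]

print(node.xpath('./text()'))     # [] - the div itself carries no text
print(node.xpath('.//text()'))    # ['剧情', '爱情'] - descendant text included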
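
The recommended form in point 3 runs the same XPath query twice. A small helper avoids that (just a sketch; first_or_none is my own name, not an lxml API):

def first_or_none(node, xp):
    # return the first XPath match, or None when the query matches nothing
    result = node.xpath(xp)
    return result[0] if result else None

# equivalent to the recommended form above:
# data_dict['score'] = first_or_none(node, './div[3]/p[1]/text()')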

requests implementation:

# coding: utf-8
# @Time : 2021/12/19
# url : 'https://ssr1.scrape.center/'

import requests
from lxml import etree
import time


class DIANying(object):

    def __init__(self):
        self.url = 'https://ssr1.scrape.center/page/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57'
        }

    def get_data(self, page):
        url = self.url + str(page)
        response = requests.get(url=url, headers=self.headers)
        time.sleep(3)    # pause between page requests
        return response.content.decode()

    def parse_data(self, res, data_list):
        # build the etree object
        html = etree.HTML(res)
        # locate the node for each movie card
        node_list = html.xpath('//*[@id="index"]/div[1]/div[1]/div/div/div')
        # iterate over the nodes and extract each field
        for node in node_list:
            data_dict = {}
            data_dict['name'] = node.xpath('./div[2]/a/h2/text()')[0] if len(node.xpath('./div[2]/a/h2/text()')) > 0 else None
            type_set = node.xpath('./div[2]/div[1][@class="categories"]//text()')
            # strip whitespace and drop empty entries (xpath returns [] when no categories exist)
            film_type = [x.strip() for x in type_set if x.strip() != '']
            type_str = '、'.join(film_type)

            data_dict['type'] = type_str
            data_dict['area'] = node.xpath('./div[2]/div[2]/span[1]/text()')[0] if len(node.xpath('./div[2]/div[2]/span[1]/text()')) > 0 else None
            data_dict['long'] = node.xpath('./div[2]/div[2]/span[3]/text()')[0] if len(node.xpath('./div[2]/div[2]/span[3]/text()')) > 0 else None
            data_dict['begin_time'] = node.xpath('./div[2]/div[3]/span/text()')[0] if len(node.xpath('./div[2]/div[3]/span/text()')) > 0 else None
            data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0].strip() if len(node.xpath('./div[3]/p[1]/text()')) > 0 else None

            data_list.append(data_dict)

        return data_list

    def run(self):
        # empty list that accumulates the scraped records
        data_list = []
        # visit each of the 10 pages in turn
        for page in range(1, 11):
            print('Scraping page {}...'.format(page))
            # request the url and get the response
            res = self.get_data(page)
            # parse the response
            data_list = self.parse_data(res, data_list)

        print(data_list)
        print(len(data_list))


if __name__ == '__main__':
    dianying = DIANying()
    dianying.run()

Summary (selenium implementation):

'''
1. When scraping with selenium and locating elements by XPath, a missing tag makes
find_element raise instead of returning nothing. How do you add a guard, the way the
requests version does with XPath, so that a missing tag is simply recorded as None?
So far I only know how to do this with try/except:
if the error is definitely caused by a missing tag, assign None directly.

2. When paging, how do you avoid clicking "next page" once the last page is reached?
If you keep clicking, locating the "next page" element fails and raises.
Again, I currently handle this with try/except:
if the error is definitely because the last page has been reached, just stop.

(A cleaner find_elements-based pattern is sketched after this note.)
'''
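
Both points can also be handled without try/except: find_elements (plural) returns an empty list instead of raising when nothing matches. A sketch under that assumption (safe_text is my own helper name; the XPaths are the ones used in the code below):

from selenium.webdriver.common.by import By

def safe_text(node, xp):
    # find_elements returns [] for a missing tag, so no exception is raised
    found = node.find_elements(By.XPATH, xp)
    return found[0].text if found else None

# point 1: a missing tag simply becomes None
# data_dict['begin_time'] = safe_text(node, './div[2]/div[3]/span')

# point 2: click "next" only if the button exists; an empty list means the last page
# next_btn = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[2]/div/div/div/a[@class="next"]/button')
# if next_btn:
#     next_btn[0].click()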

selenium implementation:

# coding: utf-8
# @Time : 2021/12/
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException


class DIANying(object):
    def __init__(self):
        self.url = 'https://ssr1.scrape.center/'
        self.driver = webdriver.Edge()

    def parse_data(self, data_list):
        node_list = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[1]/div[1]/div/div/div')
        for i in range(len(node_list)):
            try:
                data_dict = {}
                data_dict['name'] = node_list[i].find_element(By.XPATH, './div[2]/a/h2').text if len(node_list[i].find_element(By.XPATH, './div[2]/a/h2').text) > 0 else None
                type_set = node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text) > 0 else None
                data_dict['type'] = type_set
                data_dict['area'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text) > 0 else None
                data_dict['long'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text) > 0 else None
                data_dict['begin_time'] = node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text) > 0 else None
                data_dict['score'] = node_list[i].find_element(By.XPATH, './div[3]/p[1]').text.strip() if len(node_list[i].find_element(By.XPATH, './div[3]/p[1]').text) > 0 else None
            except NoSuchElementException:
                # a tag was missing: re-locate the nodes and retry,
                # this time guarding begin_time (the only field the site omits)
                node_list = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[1]/div[1]/div/div/div')
                data_dict = {}
                data_dict['name'] = node_list[i].find_element(By.XPATH, './div[2]/a/h2').text if len(node_list[i].find_element(By.XPATH, './div[2]/a/h2').text) > 0 else None
                type_set = node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text) > 0 else None

                data_dict['type'] = type_set
                data_dict['area'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text) > 0 else None
                data_dict['long'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text) > 0 else None
                try:
                    data_dict['begin_time'] = node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text) > 0 else None
                except NoSuchElementException:
                    data_dict['begin_time'] = None
                data_dict['score'] = node_list[i].find_element(By.XPATH, './div[3]/p[1]').text.strip() if len(node_list[i].find_element(By.XPATH, './div[3]/p[1]').text) > 0 else None

            data_list.append(data_dict)

        # print(data_list)
        # advance to the next page
        try:
            self.driver.find_element(By.XPATH, '//*[@id="index"]/div[2]/div/div/div/a[@class="next"]/button').click()
        except NoSuchElementException:
            # no "next" button: the last page has been reached
            return data_list
        return data_list

    def run(self):
        data_list = []
        # open the first page
        self.driver.get(self.url)
        # parse all 10 pages, clicking "next" between them
        for _ in range(10):
            data_list = self.parse_data(data_list)


        self.driver.quit()
        print(data_list)
        print(len(data_list))


if __name__ == '__main__':
    dianying = DIANying()
    dianying.run()

