# 爬虫练习 1 —— 看崔老师书 + 自己领悟
# 借用崔老师的网站,自己写 xpath 分析


import multiprocessing

import requests
from lxml import etree
import json
from os import makedirs
from os.path import exists
url = 'https://ssr1.scrape.center/'


def index_url(url):
    """GET *url* and return the page parsed as an lxml element tree.

    :param url: absolute URL of the page to fetch.
    :return: ``etree.HTML`` tree on HTTP 200, otherwise ``None``.
    """
    # Timeout keeps a stalled server from hanging a pool worker forever
    # (the original request could block indefinitely).
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        return etree.HTML(response.text)
    # NOTE(review): callers never check for this None — a failed page will
    # surface later as AttributeError in scrape_detail; consider retrying.
    return None


def get_url(page):
    """Download and parse listing page number *page* of the index site."""
    page_url = '{}page/{}'.format(url, page)
    print(page_url)
    return index_url(page_url)


def detail_url(html):
    """Yield the absolute detail-page URL of every movie on a listing page.

    :param html: parsed listing page (result of ``etree.HTML``).
    :yield: absolute URL string, one per movie card.
    """
    # The cover <img> sits inside the <a> that links to the detail page,
    # so take the href of the image's parent element.
    hrefs = html.xpath('//img[@class="cover"]/../@href')
    for href in hrefs:
        absolute = f'https://ssr1.scrape.center{href}'
        print('detail urls', absolute)  # fixed typo: "deatail" -> "detail"
        yield absolute


def detail_html(url):
    """Fetch one detail page and return it parsed (thin wrapper)."""
    parsed_page = index_url(url)
    return parsed_page


def scrape_detail(html):
    """Extract one movie's fields from a parsed detail page.

    :param html: lxml element tree of a detail page (``etree.HTML`` result).
    :return: dict with Chinese keys — cover, name, categories, country,
             running time, release date, synopsis and score.
    """
    def _pick(items, index, default='空'):
        # Detail pages sometimes omit trailing info spans; the original
        # guarded only index 3, but a short list crashed on 0 and 2 too.
        try:
            return items[index]
        except IndexError:
            return default

    # Index 1: index 0 is the site logo, 1 is the movie cover — TODO confirm
    # this holds on every page.
    cover = html.xpath('//img/@src')[1]
    name = html.xpath('//h2[@class = "m-b-sm"]/text()')[0]
    categories = html.xpath('//div[@class="categories"]//span/text()')
    info = html.xpath('//div[@class="m-v-sm info"]/span/text()')
    country = _pick(info, 0)
    time = _pick(info, 2)   # index 1 is the "/" separator span
    deputy = _pick(info, 3)

    drama = html.xpath('//div[@class="drama"]/p/text()')[0].strip()
    score = html.xpath('//p[contains(@class,"score")]/text()')[0].strip()
    return {
        '封面': cover,
        '名字': name,
        '类型': categories,
        '地区': country,
        '时长': time,
        '上映时间': deputy,
        '内容简介': drama,
        '分数': score

    }
result_file = 'contxt'
# Create the output directory once at import time; exist_ok avoids the
# race-prone "exists(...) or makedirs(...)" check-then-act idiom.
makedirs(result_file, exist_ok=True)


def save_data(data):
    """Write one movie dict to ``contxt/<名字>.json`` as pretty UTF-8 JSON.

    :param data: dict produced by ``scrape_detail``; its '名字' value
                 becomes the file name.
    """
    name = data.get('名字')
    data_path = f'{result_file}/{name}.json'
    # "with" guarantees the handle is closed (the original leaked it).
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def main(page):
    """Scrape every movie on listing page *page* and persist each as JSON."""
    listing = get_url(page)
    for link in detail_url(listing):
        page_tree = detail_html(link)
        movie = scrape_detail(page_tree)
        save_data(movie)
        print(movie)


if __name__ == '__main__':
    # Fan the ten listing pages out across one worker per CPU core.
    worker_pool = multiprocessing.Pool()
    pages = range(1, 11)
    worker_pool.map(main, pages)
    worker_pool.close()
    worker_pool.join()
# (Removed: CSDN page boilerplate — like/favorite counters and red-packet
# payment UI text — accidentally captured when the source was copied from
# the blog page. It is not part of the program.)