# 爬虫练习 1 —— 看崔老师书 + 自己领悟
# 借用崔老师的网站,自己写 xpath 分析


import multiprocessing

import requests
from lxml import etree
import json
from os import makedirs
from os.path import exists
url = 'https://ssr1.scrape.center/'


def index_url(url):
    """GET *url* and return the page parsed as an lxml element tree.

    :param url: absolute URL of the page to fetch.
    :return: ``etree.HTML`` tree on HTTP 200, otherwise ``None``.
    """
    # Timeout keeps a stalled server from hanging a pool worker forever
    # (the original request could block indefinitely).
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        return etree.HTML(response.text)
    # NOTE(review): callers never check for this None — a failed page will
    # surface later as AttributeError in scrape_detail; consider retrying.
    return None


def get_url(page):
    """Download and parse listing page number *page* of the index site."""
    page_url = '{}page/{}'.format(url, page)
    print(page_url)
    return index_url(page_url)


def detail_url(html):
    """Yield the absolute detail-page URL of every movie on a listing page.

    :param html: parsed listing page (result of ``etree.HTML``).
    :yield: absolute URL string, one per movie card.
    """
    # The cover <img> sits inside the <a> that links to the detail page,
    # so take the href of the image's parent element.
    hrefs = html.xpath('//img[@class="cover"]/../@href')
    for href in hrefs:
        absolute = f'https://ssr1.scrape.center{href}'
        print('detail urls', absolute)  # fixed typo: "deatail" -> "detail"
        yield absolute


def detail_html(url):
    """Fetch one detail page and return it parsed (thin wrapper)."""
    parsed_page = index_url(url)
    return parsed_page


def scrape_detail(html):
    """Extract one movie's fields from a parsed detail page.

    :param html: lxml element tree of a detail page (``etree.HTML`` result).
    :return: dict with Chinese keys — cover, name, categories, country,
             running time, release date, synopsis and score.
    """
    def _pick(items, index, default='空'):
        # Detail pages sometimes omit trailing info spans; the original
        # guarded only index 3, but a short list crashed on 0 and 2 too.
        try:
            return items[index]
        except IndexError:
            return default

    # Index 1: index 0 is the site logo, 1 is the movie cover — TODO confirm
    # this holds on every page.
    cover = html.xpath('//img/@src')[1]
    name = html.xpath('//h2[@class = "m-b-sm"]/text()')[0]
    categories = html.xpath('//div[@class="categories"]//span/text()')
    info = html.xpath('//div[@class="m-v-sm info"]/span/text()')
    country = _pick(info, 0)
    time = _pick(info, 2)   # index 1 is the "/" separator span
    deputy = _pick(info, 3)

    drama = html.xpath('//div[@class="drama"]/p/text()')[0].strip()
    score = html.xpath('//p[contains(@class,"score")]/text()')[0].strip()
    return {
        '封面': cover,
        '名字': name,
        '类型': categories,
        '地区': country,
        '时长': time,
        '上映时间': deputy,
        '内容简介': drama,
        '分数': score

    }
result_file = 'contxt'
# Create the output directory once at import time; exist_ok avoids the
# race-prone "exists(...) or makedirs(...)" check-then-act idiom.
makedirs(result_file, exist_ok=True)


def save_data(data):
    """Write one movie dict to ``contxt/<名字>.json`` as pretty UTF-8 JSON.

    :param data: dict produced by ``scrape_detail``; its '名字' value
                 becomes the file name.
    """
    name = data.get('名字')
    data_path = f'{result_file}/{name}.json'
    # "with" guarantees the handle is closed (the original leaked it).
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def main(page):
    """Scrape every movie on listing page *page* and persist each as JSON."""
    listing = get_url(page)
    for link in detail_url(listing):
        page_tree = detail_html(link)
        movie = scrape_detail(page_tree)
        save_data(movie)
        print(movie)


if __name__ == '__main__':
    # Fan the ten listing pages out across one worker per CPU core.
    worker_pool = multiprocessing.Pool()
    pages = range(1, 11)
    worker_pool.map(main, pages)
    worker_pool.close()
    worker_pool.join()
# (Removed: CSDN page boilerplate — like/favorite counters and red-packet
# payment UI text — accidentally captured when the source was copied from
# the blog page. It is not part of the program.)