# Practice scraper written against Teacher Cui's demo site, with my own
# XPath analysis.  Crawls the movie listing pages of
# https://ssr1.scrape.center/, follows every detail link, extracts the
# movie metadata, and stores one JSON file per movie under ./contxt.
import json
import multiprocessing
from os import makedirs
from os.path import exists

import requests
from lxml import etree

url = 'https://ssr1.scrape.center/'


def index_url(url):
    """GET *url* and return the parsed lxml HTML tree.

    Returns None when the response status is not 200 — callers must be
    prepared to handle a failed fetch.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return etree.HTML(response.text)
    return None


def get_url(page):
    """Build the listing-page URL for *page* (1-based) and fetch/parse it."""
    baseurl = f'{url}page/{page}'
    print(baseurl)
    return index_url(baseurl)


def detail_url(html):
    """Yield the absolute detail-page URL for every movie on a listing page."""
    detailurl = html.xpath('//img[@class="cover"]/../@href')
    for detail in detailurl:
        details = f'https://ssr1.scrape.center{detail}'
        print('detail urls', details)  # fixed typo: was 'deatail urls'
        yield details


def detail_html(url):
    """Fetch and parse one detail page (thin wrapper around index_url)."""
    return index_url(url)


def scrape_detail(html):
    """Extract the movie fields from a parsed detail page.

    Returns a dict with Chinese keys: cover, name, categories, country,
    runtime, release date, synopsis, score.
    """
    cover = html.xpath('//img/@src')[1]
    name = html.xpath('//h2[@class = "m-b-sm"]/text()')[0]
    categories = html.xpath('//div[@class="categories"]//span/text()')
    # The info spans hold [country, '/', runtime, release-date?]; the release
    # date is absent for some movies, hence the IndexError fallback below.
    country_time_deputy = html.xpath('//div[@class="m-v-sm info"]/span/text()')
    country = country_time_deputy[0]
    time = country_time_deputy[2]
    try:
        deputy = country_time_deputy[3]
    except IndexError:
        deputy = '空'
    drama = html.xpath('//div[@class="drama"]/p/text()')[0].strip()
    score = html.xpath('//p[contains(@class,"score")]/text()')[0].strip()
    return {
        '封面': cover,
        '名字': name,
        '类型': categories,
        '地区': country,
        '时长': time,
        '上映时间': deputy,
        '内容简介': drama,
        '分数': score
    }


# Output directory for the per-movie JSON files ('contxt' kept as-is so
# existing saved data stays in the same place).
result_file = 'contxt'
# exist_ok=True avoids the check-then-create race: under spawn-based
# multiprocessing every worker re-imports this module and the original
# `exists(...) or makedirs(...)` could raise FileExistsError.
makedirs(result_file, exist_ok=True)


def save_data(data):
    """Write *data* to <result_file>/<名字>.json (UTF-8, pretty-printed)."""
    name = data.get('名字')
    data_path = f'{result_file}/{name}.json'
    # Fix: the original passed a bare open() to json.dump and never closed
    # the handle; the context manager guarantees the file is closed.
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(page):
    """Scrape one listing page: fetch it, walk every detail page, save each."""
    indexurl = get_url(page)
    if indexurl is None:  # listing fetch failed — nothing to scrape
        return
    detailurl = detail_url(indexurl)
    for detail in detailurl:
        detailhtml = detail_html(detail)
        if detailhtml is None:  # skip detail pages that failed to fetch
            continue
        data = scrape_detail(detailhtml)
        save_data(data)
        print(data)


if __name__ == '__main__':
    # Scrape listing pages 1-10 in parallel, one task per page.
    pool = multiprocessing.Pool()
    page = range(1, 11)
    pool.map(main, page)
    pool.close()
    pool.join()