import requests
import re
import logging
from urllib.parse import urljoin
import json
from os import makedirs
from os.path import exists
import multiprocessing

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'http://ssr1.scrape.center'
TOTAL_PAGE = 10
RESULT_DIR = 'results'

# Create the output directory once at import time (idempotent).
exists(RESULT_DIR) or makedirs(RESULT_DIR)


def scrape_page(url):
    """Fetch *url* and return its HTML text, or None on any failure.

    Failures (non-200 status, network errors) are logged and swallowed
    so the crawl can continue best-effort.
    """
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_index(page):
    """Fetch the HTML of list page number *page* (1-based)."""
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def parse_index(html):
    """Yield absolute detail-page URLs found in a list page's HTML.

    Yields nothing when *html* is empty or None (e.g. the download failed,
    which previously crashed re.findall with a TypeError).
    """
    if not html:
        return
    pattern = re.compile('<a.*?href="(.*?)".*?class="name">')
    for item in re.findall(pattern, html):
        detail_url = urljoin(BASE_URL, item)
        logging.info('get detail url %s', detail_url)
        yield detail_url


def scrape_detail(url):
    """Fetch the HTML of a single detail page."""
    return scrape_page(url)


def _first_group(pattern, html, strip=False):
    """Return group(1) of the first match of *pattern* in *html*, else None."""
    match = re.search(pattern, html)
    if not match:
        return None
    text = match.group(1)
    return text.strip() if strip else text


def parse_detail(html):
    """Extract one movie's fields from a detail page's HTML.

    Returns a dict with keys cover/name/categories/published_at/drama/score;
    any field that cannot be found is None.
    """
    cover_pattern = re.compile(
        'class="item.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    name_pattern = re.compile('<h2.*?>(.*?)</h2>')
    categories_pattern = re.compile(
        '<button.*?category.*?<span>(.*?)</span>', re.S)
    # Raw string: '\d'/'\s' in a plain literal is a SyntaxWarning on 3.12+.
    published_at_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s?上映')
    drama_pattern1 = re.compile('<div .*?><h3 .*?>(.*?)</h3>', re.S)
    drama_pattern2 = re.compile('<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)
    score_pattern = re.compile('<p.*?score.*?>(.*?)</p>', re.S)

    cover = _first_group(cover_pattern, html)
    name = _first_group(name_pattern, html)
    categories = re.findall(categories_pattern, html) or None
    published_at = _first_group(published_at_pattern, html)
    drama1 = _first_group(drama_pattern1, html)
    drama2 = _first_group(drama_pattern2, html, strip=True)
    # BUG FIX: the original `drama1 + ":" + drama2` raised TypeError
    # whenever either half was missing; fall back to whichever exists.
    if drama1 is not None and drama2 is not None:
        drama = drama1 + ":" + drama2
    else:
        drama = drama1 if drama2 is None else drama2
    score = _first_group(score_pattern, html, strip=True)
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score,
    }


def save_data(data):
    """Write one movie's data to RESULT_DIR/<name>.json (UTF-8, indented)."""
    name = data.get('name')
    data_path = f'{RESULT_DIR}/{name}.json'
    # Context manager guarantees the handle is closed even if dump fails
    # (the original left the file object dangling).
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    """Crawl all list pages, then scrape, parse and save every detail page."""
    for page in range(1, TOTAL_PAGE + 1):
        index_html = scrape_index(page)
        for detail_url in parse_index(index_html):
            detail_html = scrape_page(detail_url)
            if detail_html is None:
                # Download failed; skip instead of crashing in parse_detail.
                continue
            data = parse_detail(detail_html)
            logging.info('get detail data %s', data)
            save_data(data)
            print('Save data successfully')


if __name__ == "__main__":
    main()
第二章 基础爬虫案例实战
最新推荐文章于 2024-07-24 10:38:58 发布