Douban Movie Crawler ("Web Tentacle") Project

This is a crawler with room for improvement. It sometimes fails to write its results; that may be a bug in the program logic, or an issue with the single batched write that happens after the worker threads finish. The crawler is also easy for Douban to detect, which gets your IP banned (consider buying usable proxy IPs online; see the proxy sketch below).

After these improvements it can scrape detailed data for ten thousand Douban movies.

The code follows. It assumes some Python basics; students like me can lift whichever parts they need for their own testing and study.
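Since the note above suggests buying proxy IPs, here is a minimal sketch of how a purchased proxy could be passed to requests. The proxy address and credentials are placeholders of mine, not a real endpoint:

import requests

# Hypothetical paid proxy -- replace with one you actually bought
proxies = {
    'http':  'http://user:pass@123.45.67.89:8888',
    'https': 'http://user:pass@123.45.67.89:8888',
}
headers = {'User-Agent': 'Mozilla/5.0'}

# proxies= routes the request through the proxy, so Douban sees its IP, not yours
resp = requests.get('https://movie.douban.com/j/chart/top_list',
                    params={'type': 11, 'interval_id': '100:90',
                            'action': '', 'start': 0, 'limit': 20},
                    headers=headers, proxies=proxies, timeout=10)
print(resp.status_code)

Rotating through a pool of such proxies between requests is the usual way to spread the load so that no single IP trips Douban's rate limits.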

import pprint
from bs4 import BeautifulSoup
import requests
import random
import time
import pandas as pd
import concurrent.futures
import queue
import threading
# The Cookie below is tied to one logged-in browser session and will expire;
# replace it with a fresh one copied from your own browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Cookie': 'll="118318"; bid=L_jwom1Y8f8; _pk_ref.100001.4cf6=["","",1710859202,"https://www.baidu.com/link?url=ztI_3jxw8C1HP1IioI0VZVDXlD6lik5Uf1z1hJaEdAz46IFTZOx5vUUxUltt4cyb&wd=&eqid=9301424a00565cc70000000665f9a3bc"]; _pk_id.100001.4cf6=95fefa6658059730.1710859202.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1154310134.1710859203.1710859203.1710859203.1; __utmc=30149280; __utmz=30149280.1710859203.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.1912250309.1710859203.1710859203.1710859203.1; __utmc=223695111; __utmz=223695111.1710859203.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D740891AB85CA375BB745DCE5139D061C|6cd64bfcba769f09ffc6ded64f61c8f6; __yadk_uid=HWzrORdwRnmiqYyRutxd1UN7lyapl7Sc; __utmt_douban=1; __utmt=1; __utmb=30149280.12.10.1710859203; __utmb=223695111.11.10.1710859203',
    'Referer': 'https://movie.douban.com/explore'
}


# Queue that buffers one DataFrame per movie until the CSV writer thread drains it
movie_queue = queue.Queue()
def fetch_movie_page(start, headers, type_, limit=20, interval_id='100:90'):
    """Fetch one page of the ranked-movie list as JSON."""
    base_url = "https://movie.douban.com/j/chart/top_list"

    # Build the query parameters for this page
    params = {
        "type": type_,
        "interval_id": interval_id,
        "action": "",
        "start": str(start),
        "limit": str(limit),
    }

    response = requests.get(base_url, params=params, headers=headers)

    if response.status_code == 200:
        # The endpoint returns a JSON list of movie dicts;
        # further processing or storage could happen here
        return response.json()
    else:
        print(f"Request failed, status code: {response.status_code}")
        return None
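As a quick sanity check, the function can be run on its own before wiring up the threads. This sketch reuses the headers defined above and the type code 11 from the original parameters, and uses pprint (imported at the top) to inspect one record:

page = fetch_movie_page(0, headers, 11)
if page:
    print(f"got {len(page)} movies")
    pprint.pprint(page[0])  # inspect the fields of a single movie record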

# Second request: fetch the detail page for one movie
def get_ur(move, headers):
    move_url = move['url']
    response = requests.get(url=move_url, headers=headers)

    if response.status_code == 200:
        movie_detail = response.text
        # Make sure we actually got page content back
        if not movie_detail:
            print(f"Warning: got an empty response for '{move['title']}'!")
            return None  # None signals that no usable data was fetched
        else:
            return movie_detail
    else:
        print(f"Request failed, status code: {response.status_code}")
        return None
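Given the warning at the top about bans and failed requests, one common softener is a thin retry wrapper with backoff around get_ur. This is a sketch; get_ur_with_retry and its retry/backoff numbers are my assumptions, not part of the original:

def get_ur_with_retry(move, headers, retries=3):
    # Try up to `retries` times, sleeping 1s, 2s, 4s between attempts
    for attempt in range(retries):
        detail = get_ur(move, headers)
        if detail is not None:
            return detail
        time.sleep(2 ** attempt)
    return None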
# Extract the detailed fields from a parsed detail page.
# The Chinese label strings below must match the text on the Douban page,
# so they are left untranslated.
def extract_detail(soup):
    # Director
    director = soup.find('span', {'class': 'attrs'}).a.text
    print(f"Director: {director}")

    # Writers
    writers = [w.text for w in soup.find_all('span', {'class': 'attrs'})[1].find_all('a')]
    print(f"Writers: {' / '.join(writers)}")

    # Starring
    actors = [a.text for a in soup.find('span', {'class': 'actor'}).find_all('a', {'rel': 'v:starring'})]
    print(f"Starring: {' / '.join(actors)}")

    # Genres
    genres = [g.text for g in soup.find_all('span', {'property': 'v:genre'})]
    print(f"Genres: {' / '.join(genres)}")

    # Country/region of production (fixed: guard against a missing label, and
    # initialise country so the dict below never sees an unbound name)
    country = ''
    country_span = soup.find('span', string='制片国家/地区:')
    if country_span:
        country = country_span.find_next_sibling().text.strip()
        print(f"Country/region: {country}")

    # Language (fixed)
    language = soup.find('span', class_='pl', string='语言:').next_sibling.strip()
    print(f"Language: {language}")

    # Release dates
    release_dates = [d.text.replace('上映日期:', '').strip() for d in
                     soup.find_all('span', {'property': 'v:initialReleaseDate'})]
    print(f"Release dates: {' / '.join(release_dates)}")

    # Runtime
    duration = soup.find('span', {'property': 'v:runtime'}).text.replace('片长:', '').strip()
    print(f"Runtime: {duration}")

    # Alternative titles
    other_names = soup.find('span', class_='pl', string='又名:').next_sibling.strip()
    print(f"Also known as: {other_names}")

    # IMDb link
    imdb_link = 'https://www.imdb.com/title/' + soup.find('span', class_='pl', string='IMDb:').next_sibling.strip()
    print(f"IMDb: {imdb_link}")

    # Pack the fields into a dict (keys kept in Chinese so the CSV columns
    # match the original output)
    movie_info = {
        "导演": director,
        "编剧": '/'.join(writers),
        "主演": '/'.join(actors),
        "类型": '/'.join(genres),
        "制片国家/地区": country,
        "语言": language,
        "上映日期": '/'.join(release_dates),
        "片长": duration,
        "又名": other_names,
        "IMDb链接": imdb_link
    }
    return movie_info
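Note that extract_detail raises an AttributeError as soon as a detail page lacks one of the labelled fields (the '语言:', '又名:' and 'IMDb:' lookups return None, and .next_sibling then fails). One defensive option is a small lookup helper; this is a sketch, and safe_label_text is my name for it, not part of the original:

def safe_label_text(soup, label, default=''):
    # Return the text that follows a '<span class="pl">label</span>' node,
    # or `default` when that label is absent from the page
    span = soup.find('span', class_='pl', string=label)
    if span is None or span.next_sibling is None:
        return default
    return str(span.next_sibling).strip()

# e.g. inside extract_detail:  language = safe_label_text(soup, '语言:')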
def process_movie_data(move, headers):
    print(move['title'])
    print(move['url'])

    # Random wait of 0.5 to 1 second between detail requests
    wait_time = random.uniform(0.5, 1)
    time.sleep(wait_time)

    # movie_detail is the raw HTML of the detail page
    movie_detail = get_ur(move, headers)
    if movie_detail is not None:
        soup = BeautifulSoup(movie_detail, 'lxml')
        movie_dict = extract_detail(soup)

        # Build a one-row DataFrame and put it on the queue instead of
        # writing to the file directly
        df = pd.DataFrame([movie_dict], index=[move['title']])
        movie_queue.put(df)
    else:
        print(f"Could not fetch details for '{move['title']}'")
# Mutex so only one thread touches the CSV file at a time
csv_lock = threading.Lock()

def write_to_csv():
    # Drain the queue and append each DataFrame to movies.csv. A None item
    # is the shutdown sentinel: the original completion check referenced
    # `futures` before it existed, which is one likely cause of the
    # "nothing gets written" problem mentioned at the top.
    while True:
        df = movie_queue.get()
        if df is None:
            break
        with csv_lock:
            with open('movies.csv', 'a', encoding='utf-8-sig') as f:
                df.to_csv(f, header=False, index_label='电影名称')

# Start the CSV writer thread
csv_writer_thread = threading.Thread(target=write_to_csv)
csv_writer_thread.start()

# To scrape more data, just pass in different start values and type codes
move_type_list = [11, ]    # movie type codes to crawl
# Thread pool for the detail pages
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # tune the worker count to taste
    futures = []
    for move_type in move_type_list:
        for start in range(0, 201, 20):  # adjust the range to how many films this type actually has
            move_data_list = fetch_movie_page(start, headers, move_type)
            if move_data_list:  # only submit work when the page request succeeded
                for movie in move_data_list:
                    future = executor.submit(process_movie_data, movie, headers)
                    futures.append(future)
                    time.sleep(random.uniform(0.5, 1))  # keep a random delay between submissions
    for future in concurrent.futures.as_completed(futures):
        future.result()

# All workers have finished: send the sentinel and wait for the writer thread
movie_queue.put(None)
csv_writer_thread.join()
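Once the run finishes, the output can be read back to check how many rows actually landed on disk. A small sketch; since rows were appended without a header, the columns are positional here:

check = pd.read_csv('movies.csv', header=None)
print(check.shape)   # roughly (movies scraped, 11): the title index plus the ten fields
print(check.head())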




