This is a crawler script with room for improvement. It has a known problem where data sometimes fails to be written; this may be a logic bug, or an issue with the consolidated write that happens after the worker threads finish. The script is also easy to detect while running, so the IP tends to get banned (buying usable proxy IPs online is recommended).
After the improvements described here it can crawl detailed data for around ten thousand Douban movies.
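To ease the IP-ban problem, the requests can be routed through the purchased proxy IPs. Below is a minimal sketch of that idea, assuming a hand-maintained proxy list; the addresses and the get_with_proxy helper are placeholders, not part of the original script:

import random
import requests

# Hypothetical proxy pool; fill in the proxies you actually purchased.
PROXY_POOL = [
    'http://user:pass@1.2.3.4:8080',
    'http://user:pass@5.6.7.8:8080',
]

def get_with_proxy(url, headers, timeout=10):
    # Pick a random proxy for each request so a single address is not hammered.
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=timeout)

Swapping the plain requests.get calls in the script for a helper like this is one way to spread the traffic across several addresses.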
The code is below. Some Python basics are required; students like me can take whatever parts they need for their own testing and learning.
import os
from bs4 import BeautifulSoup
import requests
import random
import time
import pandas as pd
import concurrent.futures
import queue
import threading
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'Cookie': 'll="118318"; bid=L_jwom1Y8f8; _pk_ref.100001.4cf6=["","",1710859202,"https://www.baidu.com/link?url=ztI_3jxw8C1HP1IioI0VZVDXlD6lik5Uf1z1hJaEdAz46IFTZOx5vUUxUltt4cyb&wd=&eqid=9301424a00565cc70000000665f9a3bc"]; _pk_id.100001.4cf6=95fefa6658059730.1710859202.; _pk_ses.100001.4cf6=1; ap_v=0,6.0; __utma=30149280.1154310134.1710859203.1710859203.1710859203.1; __utmc=30149280; __utmz=30149280.1710859203.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.1912250309.1710859203.1710859203.1710859203.1; __utmc=223695111; __utmz=223695111.1710859203.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D740891AB85CA375BB745DCE5139D061C|6cd64bfcba769f09ffc6ded64f61c8f6; __yadk_uid=HWzrORdwRnmiqYyRutxd1UN7lyapl7Sc; __utmt_douban=1; __utmt=1; __utmb=30149280.12.10.1710859203; __utmb=223695111.11.10.1710859203',
'Referer':'https://movie.douban.com/explore'
}
url = 'https://movie.douban.com/j/chart/top_list'
# Example of the static request parameters (kept for reference)
# payload = {
# 'type': 11,
# 'interval_id': '100:90',
# 'action': '',
# 'start': 40,
# 'limit': 20
# }
# Queue holding per-movie DataFrames waiting to be written to CSV
movie_queue = queue.Queue()
def fetch_movie_page(start, headers, type_, limit=20, interval_id='100:90'):
    base_url = "https://movie.douban.com/j/chart/top_list"
    # Build the dynamic query parameters for this page
    params = {
        "type": type_,
        "interval_id": interval_id,
        "action": "",
        "start": str(start),
        "limit": str(limit),
    }
    response = requests.get(base_url, params=params, headers=headers)
    if response.status_code == 200:
        movie_data = response.json()
        # The list of movie dicts can be processed or stored further here
        return movie_data
    else:
        print(f"Request failed, status code: {response.status_code}")
        return None
# Second request: fetch the detail page of a single movie
def get_ur(move, headers):
    move_url = move['url']
    response = requests.get(url=move_url, headers=headers)
    if response.status_code == 200:
        movie_detail = response.text
        # Check that the response body is not empty
        if not movie_detail:
            print(f"Warning: got an empty response for '{move['title']}', no detail page content!")
            return None  # None means no usable data was obtained
        else:
            return movie_detail
    else:
        print(f"Request failed, status code: {response.status_code}")
        return None
# Parse the detail fields out of a movie page
def extract_detail(soup):
    # Director
    director = soup.find('span', {'class': 'attrs'}).a.text
    print(f"Director: {director}")
    # Writers
    writers = [w.text for w in soup.find_all('span', {'class': 'attrs'})[1].find_all('a')]
    print(f"Writers: {' / '.join(writers)}")
    # Main cast
    actors = [a.text for a in soup.find('span', {'class': 'actor'}).find_all('a', {'rel': 'v:starring'})]
    print(f"Cast: {' / '.join(actors)}")
    # Genres
    genres = [g.text for g in soup.find_all('span', {'property': 'v:genre'})]
    print(f"Genres: {' / '.join(genres)}")
    # Country/region of production
    # (the Chinese literals below must match the field labels on the Douban page)
    country = ''
    country_span = soup.find('span', string='制片国家/地区:')
    if country_span:
        country = country_span.next_sibling.strip()
        print(f"Country/region: {country}")
    # Language
    language = soup.find('span', class_='pl', string='语言:').next_sibling.strip()
    print(f"Language: {language}")
    # Release dates
    release_dates = [d.text.replace('上映日期:', '').strip() for d in
                     soup.find_all('span', {'property': 'v:initialReleaseDate'})]
    print(f"Release dates: {' / '.join(release_dates)}")
    # Runtime
    duration = soup.find('span', {'property': 'v:runtime'}).text.replace('片长:', '').strip()
    print(f"Runtime: {duration}")
    # Alternative titles
    other_names = soup.find('span', class_='pl', string='又名:').next_sibling.strip()
    print(f"Also known as: {other_names}")
    # IMDb link
    imdb_link = 'https://www.imdb.com/title/' + soup.find('span', class_='pl', string='IMDb:').next_sibling.strip()
    print(f"IMDb: {imdb_link}")
    # Collect everything into a dictionary (one CSV row)
    movie_info = {
        "Director": director,
        "Writers": '/'.join(writers),
        "Cast": '/'.join(actors),
        "Genres": '/'.join(genres),
        "Country/Region": country,
        "Language": language,
        "Release date": '/'.join(release_dates),
        "Runtime": duration,
        "Also known as": other_names,
        "IMDb link": imdb_link
    }
    return movie_info
# Fetch, parse and queue the details of a single movie
def process_movie_data(move, headers):
    print(move['title'])
    print(move['url'])
    # Random delay between detail-page requests to look less like a bot
    wait_time = random.uniform(0.5, 1)
    time.sleep(wait_time)
    # movie_detail is the raw HTML of the detail page
    movie_detail = get_ur(move, headers)
    if movie_detail is not None:
        soup = BeautifulSoup(movie_detail, 'lxml')
        movie_dict = extract_detail(soup)
        # Build a one-row DataFrame indexed by the movie title
        df = pd.DataFrame([movie_dict], index=[move['title']])
        # Put the DataFrame on the queue instead of writing the file directly
        movie_queue.put(df)
    else:
        print(f"Could not get the details of '{move['title']}'")
    print()
# Mutex protecting writes to the CSV file
csv_lock = threading.Lock()
def write_to_csv():
    while True:
        time.sleep(1)  # give newly submitted tasks a moment to enqueue their results
        # Stop once every submitted task has finished and the queue has been drained
        if movie_queue.empty() and futures and all(f.done() for f in futures):
            break
        try:
            df = movie_queue.get(timeout=1)
        except queue.Empty:
            continue
        with csv_lock:
            # Write the header row only once, when the file is first created
            write_header = not os.path.exists('movies.csv') or os.path.getsize('movies.csv') == 0
            # utf-8-sig would prepend a BOM on every append, so only use it for the first write
            encoding = 'utf-8-sig' if write_header else 'utf-8'
            df.to_csv('movies.csv', mode='a', header=write_header,
                      index_label='Movie Title', encoding=encoding)
# List of futures shared with the writer thread; filled by the main loop below
futures = []
# Start the thread that writes queued results to the CSV file
csv_writer_thread = threading.Thread(target=write_to_csv)
csv_writer_thread.start()
# To crawl other pages, just pass different start values to fetch_movie_page
move_type_list = [11, ]  # movie genre ids to crawl; add more type codes as needed
# Use a thread pool to process the detail pages concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # adjust the number of workers as needed
    for move_type in move_type_list:
        for start in range(0, 201, 20):  # titles per genre; adjust to how many that genre actually has
            move_data_list = fetch_movie_page(start, headers, move_type)
            if move_data_list:  # only submit tasks when the list page was fetched successfully
                for movie in move_data_list:
                    future = executor.submit(process_movie_data, movie, headers)
                    futures.append(future)
            time.sleep(random.uniform(0.5, 1))  # keep a random delay between list-page requests
    # Surface any exception raised inside the worker threads
    for future in concurrent.futures.as_completed(futures):
        future.result()
# Wait for the writer thread to flush everything to movies.csv
csv_writer_thread.join()
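A more robust way to address the "data sometimes fails to be written" issue mentioned at the top is to stop polling the futures list and instead push a sentinel value onto the queue once all tasks are done. Below is a minimal sketch of that pattern, reusing the movie_queue and csv_lock names from the script above; the SENTINEL constant and write_to_csv_with_sentinel function are hypothetical names, not part of the original code:

SENTINEL = None  # marker telling the writer thread that no more data is coming

def write_to_csv_with_sentinel():
    while True:
        df = movie_queue.get()      # blocks until an item (or the sentinel) arrives
        if df is SENTINEL:          # producers are done, exit cleanly
            break
        with csv_lock:
            df.to_csv('movies.csv', mode='a', header=False,
                      index_label='Movie Title', encoding='utf-8')

# After the ThreadPoolExecutor block has finished submitting and waiting:
# movie_queue.put(SENTINEL)
# csv_writer_thread.join()

With this shape the writer never needs to inspect the futures list, so it cannot exit before the last result is queued or hang waiting for work that will never arrive.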