import requests
from bs4 import BeautifulSoup
from MyTools import getHtml,MySave
import pandas as pd
from pandas import Series,DataFrame
mySave = MySave()  # instantiate the storage helper

# URLs of all 10 list pages of the Douban Top 250 (25 movies per page).
urls = [f'https://movie.douban.com/top250?start={25*i}&filter=' for i in range(10)]
# NOTE(review): the original also fetched urls[0] and built a BeautifulSoup
# object here, but both results were unused (each function builds its own),
# so that import-time HTTP request has been removed.
#提取一页信息
# Extract the movies from a single list page.
def getOnePage(res):
    """Parse one Top-250 list page into [rank, filmName, href] rows.

    res: HTTP response whose ``.text`` is the HTML of a list page.
    Returns a list with one ``[rank, name, detail-url]`` entry per movie.
    """
    page = BeautifulSoup(res.text, 'html.parser')
    movie_divs = page.find(class_='grid_view').find_all(class_='item')
    return [
        [
            div.find('em').text,           # ranking number
            div.find('img').attrs['alt'],  # film title (the poster's alt text)
            div.find('a').attrs['href'],   # link to the movie's detail page
        ]
        for div in movie_divs
    ]
# Create the output folder for CSV files.
import os

# os.makedirs(..., exist_ok=True) is race-free, unlike the original
# os.path.exists() check followed by os.mkdir() (check-then-create TOCTOU).
os.makedirs('csv', exist_ok=True)
import time
import random
def main():
    """Crawl every Top-250 list page and append its rows to csv/豆瓣.csv."""
    for page_no, url in enumerate(urls, start=1):
        print(f'正在抓取第{page_no}页')
        # Pause 2-5 seconds between requests to avoid being blocked.
        time.sleep(random.randint(2, 5))
        response = getHtml(url)      # fetch the page
        rows = getOnePage(response)  # extract [rank, name, href] rows
        # Append to disk so earlier pages survive a crash mid-crawl.
        mySave.saveToCsv(rows, 'csv/豆瓣.csv', 'a')
# Reload the list-page results; the CSV has no header row, so name the
# columns explicitly ('rank' — fixed from the original 'rand' typo; the
# columns are only ever accessed positionally via .values, so this is safe).
listInfo = pd.read_csv("csv/豆瓣.csv", names=['rank', 'name', 'href'])
infos = listInfo.values  # ndarray of [rank, name, href] rows
def get_information(url):
    """Scrape one movie detail page and return its metadata.

    url: the movie's detail-page URL (from the list-page crawl).
    Returns a one-element list:
    ``[[director, writer, actors, movieType, area, year, duration,
        score, commentNumber, text]]``.
    Any field that cannot be found falls back to ``''`` (or ``[]`` for the
    list-valued fields), so one malformed page never aborts the crawl.
    The original code used bare ``except:`` clauses, which also swallowed
    KeyboardInterrupt/SystemExit; they are narrowed here to the exceptions
    these lookups can actually raise, with behavior otherwise unchanged.
    """
    res = getHtml(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    parentl = soup.find(id='info')  # the "info" panel; may be None
    data = []

    def _attrs_text(idx):
        # director/writer/actors are the 1st/2nd/3rd 'attrs' spans in #info;
        # '' when #info is missing or has fewer entries (original fallback).
        try:
            return parentl.find_all(class_='attrs')[idx].text
        except (AttributeError, IndexError):
            return ''

    director = _attrs_text(0)  # 导演
    writer = _attrs_text(1)    # 编剧
    actors = _attrs_text(2)    # 主演
    # Movie genres (电影类型)
    try:
        movieType = [each.text for each in parentl.find_all('span', attrs={'property': 'v:genre'})]
    except AttributeError:
        movieType = []
    # Production country/region (制片国家/地区)
    try:
        area = parentl.find(text='制片国家/地区:').parent.next_sibling.string
    except AttributeError:
        area = ''
    # Release dates (上映日期)
    try:
        year = [each.text for each in parentl.find_all('span', attrs={'property': 'v:initialReleaseDate'})]
    except AttributeError:
        year = []
    # Runtime (片长)
    try:
        duration = parentl.find('span', attrs={'property': 'v:runtime'}).text
    except AttributeError:
        duration = ''
    # Score (评分)
    try:
        score = soup.find(class_='rating_num').text
    except AttributeError:
        score = ''
    # Number of ratings (评价人数)
    try:
        commentNumber = soup.find('span', attrs={'property': 'v:votes'}).text
    except AttributeError:
        commentNumber = ''
    # Movie tags (电影标签); '' on failure, list on success (original behavior)
    try:
        text = [each.text for each in soup.find(class_='tags-body').find_all('a')]
    except AttributeError:
        text = ''
    data.append([director, writer, actors, movieType, area, year, duration,
                 score, commentNumber, text])
    return data
#保存
# Save detail-page metadata for a range of movies.
def getAllHerf(start, end):
    """Fetch detail-page metadata for movies ``start..end-1`` and save them.

    start, end: half-open index range into the module-level ``infos`` rows.
    Appends one row per movie to csv/嘿嘿嘿.csv (no header, no index column),
    so progress is preserved if the crawl is interrupted.
    """
    for i in range(start, end):
        print(f'正在抓取第{i+1}个:,{infos[i][1]}')
        url = infos[i][-1]
        print(url)
        # Polite 2-5 s delay between requests, consistent with main();
        # the original fired all 250 detail requests back-to-back.
        time.sleep(random.randint(2, 5))
        data = get_information(url)
        DataFrame(data).to_csv('csv/嘿嘿嘿.csv', mode='a', index=False, header=False)
        print('保存完成')


if __name__ == '__main__':
    # Guarded so that importing this module does not trigger 250 requests.
    getAllHerf(0, 250)
# 51job :250条数据爬取(函数)
# 最新推荐文章于 2022-09-21 17:31:02 发布
# (NOTE: the two lines above are stray blog-footer text copied in with the
# source; they are commented out because as bare text they break the script.)