51job: Scraping 250 Records (Function-Based)

import requests
from bs4 import BeautifulSoup
from MyTools import getHtml, MySave
import pandas as pd
from pandas import Series, DataFrame
mySave = MySave()  # instantiate the storage helper
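# MyTools is the author's own helper module; its source isn't shown in the post.
# For readers who don't have it, a plausible minimal MyTools.py might look like
# this sketch (an assumption, not the author's actual code):
#
# --- MyTools.py (hypothetical) ---
# import csv
# import requests
#
# def getHtml(url):
#     # fetch a page with a browser-like User-Agent so Douban doesn't reject the request
#     headers = {'User-Agent': 'Mozilla/5.0'}
#     return requests.get(url, headers=headers)
#
# class MySave:
#     def saveToCsv(self, data, path, mode='w'):
#         # write/append rows to a CSV; utf-8-sig keeps Excel happy with Chinese text
#         with open(path, mode, newline='', encoding='utf-8-sig') as f:
#             csv.writer(f).writerows(data)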

# all ten list-page URLs (25 films per page)
urls=[f'https://movie.douban.com/top250?start={25*i}&filter=' for i in range(10)]

# quick test: fetch the first page and parse it into a BeautifulSoup object
res = getHtml(urls[0])
soup = BeautifulSoup(res.text, 'html.parser')
# extract one list page: rank, film name, and detail-page link for each film
def getOnePage(res):
    soup = BeautifulSoup(res.text, 'html.parser')
    parent = soup.find(class_='grid_view')
    divs = parent.find_all(class_='item')
    data = []
    for each in divs:
        rank = each.find('em').text
        filmName = each.find('img').attrs['alt']
        # detail-page link
        href = each.find('a').attrs['href']
        data.append([rank, filmName, href])
    return data
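# Optional sanity check: run the extractor on the test page fetched above.
# Each row is [rank, film name, detail-page link].
print(getOnePage(res)[:3])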

# create the output folder from code
import os
# os.path.exists('csv') checks whether the csv folder already exists
if not os.path.exists('csv'):
    # create it
    os.mkdir('csv')
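# Alternative: a single call that is a no-op when the folder already exists:
# os.makedirs('csv', exist_ok=True)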
    
import time 
import random
def main():
    for index, each in enumerate(urls):
        print(f'Fetching list page {index+1}')
        # random pause so the requests don't hammer the server
        time.sleep(random.randint(2, 5))
        # fetch
        res = getHtml(each)
        # extract
        data = getOnePage(res)
        # save (append)
        mySave.saveToCsv(data, 'csv/豆瓣.csv', 'a')

main()

# read the list back; columns are rank, film name, and detail-page link
listInfo = pd.read_csv('csv/豆瓣.csv', names=['rank', 'name', 'href'])
infos = listInfo.values
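# Sanity check: ten list pages at 25 films each should give 250 rows.
print(len(listInfo))  # expected: 250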
def get_information(url):
    res = getHtml(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    parent = soup.find(id='info')
    data = []
    # director
    try:
        director = parent.find_all(class_='attrs')[0].text
    except Exception:
        director = ''

    # screenwriter
    try:
        writer = parent.find_all(class_='attrs')[1].text
    except Exception:
        writer = ''

    # leading actors
    try:
        actors = parent.find_all(class_='attrs')[2].text
    except Exception:
        actors = ''

    # genres
    try:
        movieType = [each.text for each in parent.find_all('span', attrs={'property': 'v:genre'})]
    except Exception:
        movieType = []

    # country/region of production
    try:
        area = parent.find(text='制片国家/地区:').parent.next_sibling.string
    except Exception:
        area = ''

    # release date(s)
    try:
        year = [each.text for each in parent.find_all('span', attrs={'property': 'v:initialReleaseDate'})]
    except Exception:
        year = []

    # runtime
    try:
        duration = parent.find('span', attrs={'property': 'v:runtime'}).text
    except Exception:
        duration = ''

    # rating
    try:
        score = soup.find(class_='rating_num').text
    except Exception:
        score = ''

    # number of ratings
    try:
        commentNumber = soup.find('span', attrs={'property': 'v:votes'}).text
    except Exception:
        commentNumber = ''

    # film tags
    try:
        tags = [each.text for each in soup.find(class_='tags-body').find_all('a')]
    except Exception:
        tags = []

    data.append([director, writer, actors, movieType, area, year, duration, score, commentNumber, tags])
    return data
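# Optional: test the extractor on a single film before the full 250-page run
# (uses the first detail link collected above).
print(get_information(infos[0][-1]))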
# fetch and save the details for every film in the list
def getAllHref(start, end):
    for i in range(start, end):
        print(f'Fetching item {i+1}: {infos[i][1]}')
        url = infos[i][-1]
        print(url)
        # same random pause as the list pages, to lower the risk of being blocked
        time.sleep(random.randint(2, 5))
        data = get_information(url)
        DataFrame(data).to_csv('csv/嘿嘿嘿.csv', mode='a', index=False, header=False)
        print('saved')
getAllHref(0, 250)
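# Load the detail file back for analysis; the column names below follow the
# order the fields are written in get_information (names chosen here, not in
# the original post).
details = pd.read_csv('csv/嘿嘿嘿.csv',
                      names=['director', 'writer', 'actors', 'type', 'area',
                             'date', 'duration', 'score', 'votes', 'tags'])
print(details.head())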