使用requests、selenium和BeautifulSoup库爬取豆瓣各类型电影

代码:

# -*-coding:utf-8-*-
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import requests
import time
import json
import random


#http请求头
Hostreferer = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36',
    'Referer':'https://movie.douban.com/subject/'
               }


#设置代理
ip_list = ['61.155.164.109:3128', '117.158.57.2:3128', '123.207.25.143:3128', '61.155.164.107:3128','61.155.164.111:3128','61.4.184.180:3128']
def use_requests():
    html = requests.get('http://icanhazip.com', proxies={'http': 'http://' + random.choice(ip_list)}).text
    print(html)
    
    
def driverHtml(url):
    try:
        
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(url)
        sourcePage = driver.page_source
        soup = BeautifulSoup(sourcePage, "lxml")
        return soup
    except:
        return ""
    
def getMoviceHref(soup):
    HrefInfo = soup.find('div',attrs={'class':'types'})
    tmp = r'/[^\s]*.action='
    HrefList = re.findall(tmp,str(HrefInfo))
    #print(HrefList)
    pattern = 'type=[0-9]+'
    typeNum = re.findall(pattern,str(HrefList))
    print(typeNum)
    return HrefList,typeNum


def getTypeHtml(typeNum):
    index = 0
    max = len(typeNum)
    for i in range(8,max):
        use_requests()
        for j in range(1,8):
            #use_requests()
            num = 20 * j
            time.sleep(5)
            typeurl = JsonStart_url + str(typeNum[i]) + JsonMid_url + str(num) + JsonEnd_url
            print(typeurl)
            tx = getHTMLtext(typeurl)
            if tx == "":
                break
            svalues = json.loads(tx)
            #print(svalues)
            for k in range(len(svalues)):
                #重新设置代理
                #use_requests()
                print(svalues[k]['url'])
                MovUrl = svalues[k]['url'].replace('\/','/')
                #print(svalues[k]['url'])
                #MoviceUrl.append(svalues[k]['url'])
                # 睡眠5毫秒
                time.sleep(5)
                htmlText = getHTMLtext(MovUrl)
                if htmlText == "":
                    continue
                soups = BeautifulSoup(htmlText, 'html.parser')
                index = index+1
                getMovieInfo(i,j,k,index,soups)




def getHTMLtext(url,code='utf-8'):
    try:
        r = requests.get(url,headers = Hostreferer)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except:
        return ""


def getMovieInfo(i,j,k,index,soups):
    try:
        InfoList = []
        Infos = soups.find('div', attrs={'id': 'info'})
        # print(Infos)
        if Infos == "":
            print("第{0}类的第{1}页的第{2}个爬取失败...".format(i + 1, j, k + 1))
            pass
        else:
            print("正在抓取第{0}类的第{1}页的第{2}个电影信息...".format(i + 1, j, k + 1))
            # 电影导演
            director = Infos.find_all(attrs={'rel': 'v:directedBy'})[0].text.split()[0]
            #print("导演:" + director)
            # 获取电影名字
            name = soups.find('span', attrs={'property': 'v:itemreviewed'}).text.split()[0]
            #print("影名:" + name)
            # 电影主演
            actors = soups.find_all(attrs={'rel': 'v:starring'})
            atr = ""
            for at in actors:
                atr = atr + at.text.split()[0] + '/'
            #print("主演:" + atr)
            # 电影类型
            type = soups.find_all('span', attrs={'property': 'v:genre'})
            ts = ""
            for t in type:
                ts = ts + t.text.split()[0] + '/'
            #print("类型:" + ts)
            # 电影上映时间
            time = soups.find_all(attrs={'property': 'v:initialReleaseDate'})
            ti = ""
            for t in time:
                ti = ti + t.text.split()[0] + '/'
            #print("上映时间:" + ti)
            # 电影放映时间长短
            runtime = soups.find_all(attrs={'property': 'v:runtime'})
            rt = ""
            for r in runtime:
                rt = rt + r.text.split()[0] + '/'
            #print("放映时间:" + rt)
            # 豆瓣评分
            average = soups.find_all(attrs={'property': 'v:average'})[0].text.split()[0]
            #print("豆瓣评分:" + average)
            # 参与评价人数
            rating_vote = soups.find_all(attrs={'property': 'v:votes'})[0].text.split()[0] + "人评论"
            #print("参加评论人数:" + rating_vote)
            # 5星好评比例
            starts5 = soups.find_all(attrs={'class': 'rating_per'})[0].text.split()[0]
            #print("五星好评比例:" + starts5)
        
            InfoList.append([director, name, atr, ts, ti, rt, average, rating_vote, starts5])
        
            fw.write(",".join(InfoList[0]) + "\n")
            print("成功写入{0}个".format(index))
        
            print("---------------------------------------------------------")
    except:
        pass


def main():
    #使用selenium工具获取html网页源码
    soup = driverHtml(url)
    #获取每种类型电影的url
    HrefList,typeNum = getMoviceHref(soup)
    #根据每种类型电影,获取该类型下每个电影的url
    getTypeHtml(typeNum)
    
    
if __name__ == '__main__':
    index = 1
    # 电影分类排行榜
    url = 'https://movie.douban.com/chart'
    start_url = 'https://movie.douban.com'
    MoviceUrl = []
    JsonStart_url = 'https://movie.douban.com/j/chart/top_list?'
    JsonMid_url = '&interval_id=100%3A90&action=&start='
    JsonEnd_url = '&limit=20'
    fw = open("douban.csv", 'a+')
    row = ["电影导演", "电影名字", "电影主演", "电影类型", "上映时间", "放映时间", "豆瓣评分", "参加评论人数", "五星好评比例"]
    fw.write(",".join(row) + "\n")
    main()

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值