Scraping the past five years of Chinese-language movies with Python and saving them to an Excel sheet

Written to help a youngster in one of my group chats. I didn't pick these variable names; it's a college assignment, so go easy on it.

Nth update: added a fetcher for Kuaidaili's free proxies, to avoid getting shut down by Douban's anti-scraping. One small bug in earlier versions was that the crawl never stopped on its own after finishing; breaking out once the listing API returns an empty page takes care of that, but if it ever just keeps printing proxy IPs, stop it manually. That's a wrap; if you run into problems, feel free to reach out and discuss.
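Free proxies go stale fast, so it can be worth checking each one before it goes into proxy.txt. Below is a minimal sketch of such a liveness check; the httpbin.org test URL and the 5-second timeout are my own choices, not something the original script uses:

import requests

def proxy_alive(proxy_url, test_url='https://httpbin.org/ip', timeout=5):
    # Return True if the proxy can complete a simple request in time
    try:
        r = requests.get(test_url,
                         proxies={'http': proxy_url, 'https': proxy_url},
                         timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

# Example: keep only proxies that pass the check before saving them
# live = [p for p in data if proxy_alive(p)]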

The code is below, for reference only:

import time
import json
import requests
import xlwt
from lxml import etree
from bs4 import BeautifulSoup
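# Note: the third-party packages used here can be installed with, e.g.:
#   pip install requests lxml beautifulsoup4 xlwt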

# Fetch a page and return its HTML text
def get_data(url):
    headers = {
        'user-agent': 'Mozilla/5.0'
    }
    html = requests.get(url, headers=headers)  # headers must be passed as a keyword argument
    html.encoding = 'utf-8'
    return html.text
# Parse the Kuaidaili free-proxy list page
def parse_data(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Alternative: look up cells by their data-title attributes, e.g.
    #   protocol = soup.find_all(attrs={'data-title': '类型'})
    #   ip = soup.find_all(attrs={'data-title': 'IP'})
    #   port = soup.find_all(attrs={'data-title': 'PORT'})
    # protocol / address / port columns of the table
    protocol = soup.select('#list > table > tbody > tr > td:nth-child(4)')
    ip = soup.select('#list > table > tbody > tr > td:nth-child(1)')
    port = soup.select('#list > table > tbody > tr > td:nth-child(2)')
    data = []       # assembled proxy URLs
    for i in range(len(ip)):        # assumes len(ip) == len(port) == len(protocol)
        temp = protocol[i].get_text() + '://' + ip[i].get_text() + ':' + port[i].get_text()  # join into a URL
        data.append(temp)
    return data

# Save the proxy URLs to a file, one per line
def save_data(data):
    with open(proxy, 'w') as f:  # overwrite so stale proxies from earlier runs don't linger
        for item in data:
            f.write(item + '\n')

def processing_data(content_list):
    # Create a workbook with utf-8 encoding
    workbook = xlwt.Workbook(encoding='utf-8')
    # Create a worksheet
    worksheet = workbook.add_sheet('My Worksheet')
    # Write each movie record into a row
    for i, content in enumerate(content_list):
        for x, info in enumerate(content):
            worksheet.write(i, x, label=info)
    # Save to disk
    workbook.save('movie_info.xls')


def save_info(s, content):
    info = content.xpath("//div[@id='info']")[0]
    try:
        # title
        name = str(content.xpath('//*[@id="content"]/h1/span[1]/text()')[0]).replace("'", " ")
    except Exception:
        name = "N/A"
    try:
        # director
        daoyan = str(info.xpath("./span[1]/span[2]/a/text()")[0]).replace("'", " ")
    except Exception:
        daoyan = "N/A"
    try:
        # screenwriter
        bianju = str(info.xpath("./span[2]/span[2]/a/text()")[0]).replace("'", " ")
    except Exception:
        bianju = "N/A"
    try:
        # cast
        zhuyan = '/'.join(info.xpath("./span[3]/span[2]/a/text()")).replace("'", " ")
    except Exception:
        zhuyan = "N/A"
    try:
        # genres
        leixing = '/'.join(info.xpath("./span[@property='v:genre']/text()")).replace("'", " ")
    except Exception:
        leixing = "N/A"
    try:
        # release date(s)
        shangyingshijian = '/'.join(info.xpath(".//span[@property='v:initialReleaseDate']/text()")).replace("'", " ")
    except Exception:
        shangyingshijian = "N/A"
    try:
        # runtime
        shichang = str(info.xpath(".//span[@property='v:runtime']/text()")[0]).replace("'", " ")
    except Exception:
        shichang = "N/A"
    try:
        # rating
        pingfen = str(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]).replace("'", " ")
    except Exception:
        pingfen = "N/A"
    try:
        # synopsis
        jianjie = str(content.xpath('//*[@id="link-report"]/span[1]/text()')[0]).replace("'", " ")
    except Exception:
        jianjie = "N/A"
    # poster URL, unused:
    # tupian = str(content.xpath('//*[@id="mainpic"]/a/img/@src')[0]).replace("https://", "")
    try:
        # number of ratings
        pingjiarenshu = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
    except Exception:
        pingjiarenshu = "N/A"
    print("Scraping movie #%d, released in %s, starring %s: %s" % (s, shangyingshijian[0:4], zhuyan, name))
    one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu, leixing, shangyingshijian, shichang, jianjie]
    all_list.append(one_info)


def main():
    s = 0
    i = 0
    # Load the proxy pool once and rotate through it round-robin
    with open("proxy.txt", "r") as f:
        proxy_list = [line.strip() for line in f if line.strip()]
    try:
        for x in range(0, 9999):
            url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=time&page_limit=20&page_start=' + str(x * 20)

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                # session cookie from the original post; swap in your own if requests start failing
                'Cookie': 'bid=8u7taHNdsWM; __utmc=30149280; __utmc=223695111; __utmz=223695111.1607998669.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __yadk_uid=9x4B44CN2IsA8mMQ5aAyjQ4SaozNfPF2; __gads=ID=faf2684739e4c7f2-22e5424930c50003:T=1607998670:RT=1607998670:S=ALNI_MYbSVvFUx-vDkas8JkBXbnxevAHWA; ll="118282"; ct=y; _vwo_uuid_v2=DE86177D6BC486F18E203C7287F2B1E77|1fd9d3b9c304cda3f3602953aa741fcc; dbcl2="228452659:QZuIW0RNFQA"; ck=Z6d9; push_noty_num=0; push_doumail_num=0; __utma=30149280.78821852.1607998669.1608094761.1608104129.3; __utmz=30149280.1608104129.3.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmv=30149280.22845; __utmb=30149280.2.10.1608104129; __utma=223695111.1226569761.1607998669.1608094761.1608104244.3; __utmb=223695111.0.10.1608104244; _pk_id.100001.4cf6=1b0982adf0b4c756.1607998669.3.1608104244.1608095066.; _pk_ses.100001.4cf6=*'
            }
            proxyss = proxy_list[i % len(proxy_list)]  # wrap around instead of running off the end
            i += 1
            # both keys are needed; with only 'http', https requests would bypass the proxy
            proxies = {'http': proxyss, 'https': proxyss}
            print(proxies)
            content = requests.get(url, proxies=proxies, headers=headers, timeout=10)  # timeout so a dead proxy doesn't hang
            if content.status_code != 200:
                print('Request failed')
                continue
            content_json = json.loads(content.text)["subjects"]
            if not content_json:
                break  # empty page means no more results; without this the loop never stops
            for one_info in content_json:
                one_id = one_info["id"]
                print(one_id)
                url2 = "https://movie.douban.com/subject/%s/" % one_id
                html = requests.get(url2, proxies=proxies, headers=headers, timeout=10)
                if html.status_code == 200:
                    content = html.content.decode("utf-8")
                    content = etree.HTML(content)
                    s += 1
                    save_info(s, content)
                else:
                    print('Request failed')
                time.sleep(1)
    except Exception:
        # if anything blows up mid-crawl, save whatever has been collected so far
        processing_data(all_list)


if __name__ == '__main__':
    proxy = 'proxy.txt'
    url = 'https://www.kuaidaili.com/free/inha/1'
    html = get_data(url)
    data = parse_data(html)
    save_data(data)
    print('Finished fetching free proxies')
    all_list = []
    main()
    processing_data(all_list)
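
To sanity-check the output, here is a minimal sketch that reads the generated spreadsheet back; it assumes xlrd 1.x, the last line of releases that can still open .xls files:

import xlrd

book = xlrd.open_workbook('movie_info.xls')
sheet = book.sheet_by_index(0)
print('Rows written:', sheet.nrows)
for r in range(min(5, sheet.nrows)):  # show the first few records
    print(sheet.row_values(r))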

 
