爬取豆瓣电影TOP250的数据/存储到MySQL

 

 

 

import requests
from lxml import etree
import re
import pymysql
import time

conn = pymysql.connect(host='localhost', user='root', passwd='cjlushenbin', db='my_database', port=3306, charset='utf8')
cursor = conn.cursor()

headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
}

def get_movie_url(url):
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    movie_hrefs = selector.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)

def get_movie_info(url):
    html = requests.get(url,headers=headers)
    selector = etree.HTML(html.text)
    try:
        name = selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
        director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
        actor = selector.xpath('string(//*[@id="info"]/span[3]/span[2])')
        style = re.findall('<span property="v:genre">(.*?)</span>',html.text,re.S)
        style = '/'.join(style)
        # style = selector.xpath('//span[@property="v:genre"]//text()')
        # style = '/'.join(style)
        country = re.findall('<span class="pl">制片国家/地区:</span> (.*?)<br/>',html.text,re.S)[0]
        release_time = selector.xpath('//span[@property="v:initialReleaseDate" ]//text()')
        release_time = '/'.join(release_time)
        # time = re.findall('片长:</span>.*?>(.*?)</span>',html.text,re.S)[0]
        time = re.findall('<span class="pl">片长:</span>.*?>(.*?)</span>(.*?)<br/>',html.text,re.S)[0]
        time = ''.join(time)
        score = selector.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
        print(name,director,actor,style,country,release_time,time,score)
        cursor.execute(
            "insert into doubanmovie (name,director,actor,style,country,release_time,time,score) values(%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(name), str(director), str(actor), str(style), str(country), str(release_time), str(time), str(score)))

    except IndexError:
        pass

if __name__ == '__main__':
    urls = ['https://movie.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]
    for url in urls:
        get_movie_url(url)
        time.sleep(1)
    conn.commit()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值