Scraping a Movie Ranking List

Web crawler practice: scraping the Douban Chinese-language movie chart page

The target page is the Douban Chinese-language movie chart. The script uses Python 3's requests to fetch each page and lxml's XPath to parse the HTML, sends a Chrome User-Agent header to mimic a real browser, and finally saves the scraped data to a TXT file on the computer's F: drive. The fields collected are each movie's rank, title, and short description. The code is as follows:

from requests.exceptions import RequestException
from lxml import etree
import requests
import json
import time

def get_page(url):
    try:
        # A Chrome User-Agent header mimics a real browser
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print('Successful')
            return response.text
    except RequestException:
        print('Failed')

def parse_page(html_text):
    # Parse the HTML and yield one record per movie entry
    html = etree.HTML(html_text)
    infos = html.xpath('//div[@class="doulist-item"]')
    time.sleep(1)  # brief pause to be polite to the server
    for info in infos:
        yield {
            'position': ''.join(info.xpath('./div/div[1]/span/text()')).strip(),
            'name': ''.join(info.xpath('./div/div[2]/div[4]/a/text()')).strip(),
            'abstract': ''.join(info.xpath('./div/div[2]/div[6]/text()')).strip()
        }

def write_text(item):
    # Append each record to the TXT file as one JSON object per line
    with open('f:/result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

def main(url):
    html_text = get_page(url)
    if html_text is not None:
        for item in parse_page(html_text):
            write_text(item)
            print(item)

if __name__ == '__main__':
    # The chart is paginated at 25 entries per page
    for i in range(0, 4):
        url = 'https://www.douban.com/doulist/13704241/?start={}&sort=seq&playable=0&sub_type='.format(i * 25)
        main(url)
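
Because each record is written as one JSON object per line, the file can later be loaded back line by line. A minimal sketch, assuming the script above has already written f:/result.txt:

import json

# Load the JSON-lines file produced by write_text() above
with open('f:/result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f]

print(len(movies))        # total number of records scraped
print(movies[0]['name'])  # title of the first record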

As a further improvement on the program above, the scraped data is saved to a MySQL database instead of a TXT file:

from requests.exceptions import RequestException
from lxml import etree
import requests
import pymysql
import time

def set_mysql():
    # Create the movie database (if it does not exist yet)
    db = pymysql.connect(host='localhost', user='root', password='root123456', port=3306)
    cursor = db.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS movie DEFAULT CHARACTER SET utf8')
    db.close()
    # Create the movies table inside it (if it does not exist yet)
    db = pymysql.connect(host='localhost', user='root', password='root123456', port=3306, db='movie')
    cursor = db.cursor()
    sql = ('CREATE TABLE IF NOT EXISTS movies('
           'position INT NOT NULL, '
           'name VARCHAR(255) NOT NULL, '
           'abstract VARCHAR(255) NOT NULL, '
           'PRIMARY KEY (position))')
    cursor.execute(sql)
    db.close()

def get_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print('Successful')
            return response.text
    except RequestException:
        print('Failed')

def parse_page(html_text):
    # Extract one (position, name, abstract) tuple per movie entry
    html = etree.HTML(html_text)
    infos = html.xpath('//div[@class="doulist-item"]')
    movies = []
    for info in infos:
        position = ''.join(info.xpath('./div/div[1]/span/text()')).strip()
        name = ''.join(info.xpath('./div/div[2]/div[4]/a/text()')).strip()
        abstract = ''.join(info.xpath('./div/div[2]/div[6]/text()')).strip()
        movies.append((position, name, abstract))
    time.sleep(1)
    set_mysql()
    # Reuse one connection for all inserts; parameterized SQL avoids quoting problems
    db = pymysql.connect(host='localhost', user='root', password='root123456', port=3306, db='movie')
    cursor = db.cursor()
    for position, name, abstract in movies:
        cursor.execute('INSERT INTO movies (position, name, abstract) VALUES (%s, %s, %s)',
                       (position, name, abstract))
    db.commit()
    db.close()

def main(url):
    html_text = get_page(url)
    if html_text is not None:
        parse_page(html_text)

if __name__ == '__main__':
    for i in range(0, 4):
        url = 'https://www.douban.com/doulist/13704241/?start={}&sort=seq&playable=0&sub_type='.format(i * 25)
        main(url)
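
To check that the rows actually landed in the database, the movies table can be queried directly. A minimal sketch using pymysql with the same connection parameters as above:

import pymysql

# Connect to the movie database created by set_mysql()
db = pymysql.connect(host='localhost', user='root', password='root123456', port=3306, db='movie')
cursor = db.cursor()
cursor.execute('SELECT position, name, abstract FROM movies ORDER BY position LIMIT 5')
for row in cursor.fetchall():
    print(row)
db.close()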

