python爬虫(一)

python 爬取豆瓣电影TOP250

  • 利用第三方库requests实现请求
  • xpath解析网页
  • 采用MYSQL存储


requests

import requests

lxml

from lxml import etree (采用xpath解析网页)

pymysql

import pymysql  (操作MYSQL)  

数据库DDL

-- Start from a clean table. IF EXISTS keeps the very first run from failing
-- when db250 has not been created yet.
DROP TABLE IF EXISTS db250;
-- One row per movie; the columns match the INSERT issued by the scraper:
-- rank, Chinese title, English title, rating, director, cast, year, region,
-- genre, one-line review.
CREATE TABLE db250
(
  排名   INT DEFAULT 0   NOT NULL,
  中文影名 VARCHAR(50)     NULL,
  英文影名 VARCHAR(80)     NULL,
  评分   FLOAT           NULL,
  导演   VARCHAR(60)     NULL,
  主演   VARCHAR(60)     NULL,
  时间   VARCHAR(50)      NULL,
  地区   VARCHAR(20)     NULL,
  类型   VARCHAR(15)     NULL,
  影评   VARCHAR(100)    NULL
);

数据库连接

# Open the MySQL connection on the local server; the passwd and db values are
# placeholders (they read "password" and "database name") to be replaced.
self.db = pymysql.Connect(host='127.0.0.1', port=3306, user='root',passwd='密码', db='数据库名', charset='utf8')
# Cursor used to execute SQL statements on that connection.
self.cursor = self.db.cursor()

构建网页链接

# Common prefix of every result-page URL; the start offset is appended below.
self.base_url = 'https://movie.douban.com/top250?start='
# One URL per page: start = 0, 25, ..., 225 (ten URLs in total).
self.url_list = [self.base_url + str(i) + "&filter=" for i in range(0, 226, 25) ]

请求及解析网页

for url in self.url_list:

        # Fetch the page
        req = requests.get(url, headers=self.headers)
        # Set the response encoding
        req.encoding = 'utf-8'
        # Parse the page into an element tree
        root = etree.HTML(req.content)

        # One <div class="item"> element per movie on the page
        texts = root.xpath('//ol/li/div[@class="item"]')

        for text in texts:
            # Initialise the movie names and the director/actor placeholders
            Chinese_Name = ''
            English_Name = ''
            director = 'NULL'
            actor = 'NULL'

            # Rank
            rank = int(text.xpath('./div[@class="pic"]/em/text()')[0]) 
            # Link to the movie page
            movie_url = text.xpath('./div[@class="pic"]/a/@href')[0]
            # Movie title(s)
            name = text.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
            # Director and leading actors (text nodes of the info paragraph)
            director_actor = text.xpath('./div[@class="info"]/div[@class="bd"]//p[@class]/text()')
            # Rating
            rating_num = text.xpath('./div[@class="info"]//div[@class="star"]/span[@class="rating_num"]/text()')[0]
            # One-line review (empty list when the movie has none)
            quote = text.xpath('./div[@class="info"]//p[@class="quote"]/span[@class="inq"]/text()')

代码

import requests
import pymysql
from lxml import etree

class DB250_spride(object):
    """Scrape the Douban Top 250 movie list and store every entry in MySQL.

    The result pages are fetched with ``requests``, parsed with lxml XPath
    queries and written to the ``yw_test.db250`` table through pymysql.
    """

    # Placeholder used when a movie has no actor list or no one-line review.
    # NOTE(review): this is the literal text 'NULL', not SQL NULL. It is kept
    # because that is what the original string-built INSERT stored.
    MISSING = 'NULL'

    # Seconds to wait for a page before giving up on it.
    REQUEST_TIMEOUT = 10

    # Parameterized statement: the driver escapes every value, so quotes in
    # titles or names can neither break the statement nor inject SQL.
    INSERT_SQL = (
        "insert into yw_test.db250"
        "(排名, 中文影名, 英文影名, 评分, 导演, 主演, 时间, 地区, 类型, 影评) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self):
        """Build the list of page URLs and open the database connection."""
        self.base_url = 'https://movie.douban.com/top250?start='
        # One URL per page: start = 0, 25, ..., 225.
        self.url_list = [self.base_url + str(i) + "&filter=" for i in range(0, 226, 25)]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        # Connect to the database
        self.db = pymysql.Connect(host='127.0.0.1', port=3306, user='root',passwd='密码', db='yw_test', charset='utf8')
        self.cursor = self.db.cursor()

    @staticmethod
    def _split_names(name):
        """Return ``(chinese_name, english_name)`` from the title text nodes.

        With exactly two nodes the second one is the foreign title; its
        slashes and leading whitespace are removed. An empty list yields two
        empty strings instead of raising.
        """
        chinese_name = str(name[0]) if name else ''
        english_name = ''
        if len(name) == 2:
            english_name = str(name[1]).replace('/', '').lstrip()
        return chinese_name, english_name

    @staticmethod
    def _split_people(line):
        """Return ``(director, actor)`` parsed from the first info line.

        All spaces and newlines are dropped first; the run of three
        non-breaking spaces between the two parts then becomes the split
        marker. ``actor`` is ``'NULL'`` unless the line has exactly two parts.
        """
        cleaned = line.replace(' ', '').replace('\n', '')
        parts = cleaned.replace('\xa0' * 3, '  ').split('  ')
        director = parts[0].replace('导演:', '')
        if len(parts) == 2:
            actor = parts[1].replace('主演:', '')
        else:
            actor = 'NULL'
        return director, actor

    @staticmethod
    def _split_details(line):
        """Return ``(year, location, types)`` parsed from the second info line.

        Fields are separated by ``'\\xa0/\\xa0'``. The last two fields are taken
        as location and types; anything before them is joined with ``'/'`` as
        the year, so an entry listing several release years keeps its columns
        aligned (assumes location and genre are always the final two fields).

        Raises:
            ValueError: if the line has fewer than three fields.
        """
        cleaned = line.replace(' ', '').replace('\n', '')
        fields = cleaned.replace('\xa0/\xa0', '  ').split('  ')
        if len(fields) < 3:
            raise ValueError('unexpected detail line: %r' % line)
        return '/'.join(fields[:-2]), fields[-2], fields[-1]

    def _parse_item(self, item):
        """Extract one row of column values from a movie ``item`` element.

        The values are returned in the order expected by ``INSERT_SQL``.

        Raises:
            IndexError: if an expected node is missing.
            ValueError: if the rank, rating or detail line is malformed.
        """
        rank = int(item.xpath('./div[@class="pic"]/em/text()')[0])
        name = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        director_actor = item.xpath('./div[@class="info"]/div[@class="bd"]//p[@class]/text()')
        rating = float(item.xpath('./div[@class="info"]//div[@class="star"]/span[@class="rating_num"]/text()')[0])
        quote = item.xpath('./div[@class="info"]//p[@class="quote"]/span[@class="inq"]/text()')

        chinese_name, english_name = self._split_names(name)
        director, actor = self._split_people(director_actor[0])
        year, location, types = self._split_details(director_actor[1])
        # Some movies have no one-line review.
        review = str(quote[0]) if quote else self.MISSING

        return (rank, chinese_name, english_name, rating, director, actor,
                year, location, types, review)

    def Parse_url_and_InsertDB(self):
        """Fetch every page, parse each movie and insert it into the table.

        A page that cannot be downloaded, an item that cannot be parsed or a
        row that cannot be inserted is reported on stdout and skipped, so one
        bad entry does not abort the whole crawl.
        """
        separator = "*" * 60

        for url in self.url_list:
            try:
                req = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                req.raise_for_status()
            except requests.RequestException as e:
                print(e)
                continue

            # Parse the raw bytes of the response body.
            root = etree.HTML(req.content)

            for item in root.xpath('//ol/li/div[@class="item"]'):
                try:
                    row = self._parse_item(item)
                except (IndexError, ValueError) as e:
                    print(e)
                    continue

                print(separator, end='\n\n')
                for value in row:
                    print(value)
                print(separator, end='\n\n')

                try:
                    self.cursor.execute(self.INSERT_SQL, row)
                    self.db.commit()
                except pymysql.Error as e:
                    self.db.rollback()
                    print(e)

    def CloseDB(self):
        """Close the cursor and then the database connection."""
        self.cursor.close()
        self.db.close()

def main():
    """Run the scraper once and always release the database connection."""
    spride1 = DB250_spride()
    try:
        spride1.Parse_url_and_InsertDB()
    finally:
        # Close the cursor and connection even if the crawl raises; the
        # original never called CloseDB, so both were leaked.
        spride1.CloseDB()



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

这里写图片描述
1


  1. 技术有限,欢迎大家评论指出,一起交流学习。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值