python爬虫(一)

最新推荐文章于 2024-08-06 11:55:39 发布

西瓜味的代码

最新推荐文章于 2024-08-06 11:55:39 发布

阅读量270

点赞数

分类专栏： python爬虫

本文链接：https://blog.csdn.net/qq_37594187/article/details/80155295

版权

python爬虫专栏收录该内容

1 篇文章

订阅专栏

python 爬取豆瓣电影TOP250

利用第三方库requests实现请求
xpath解析网页
采用MYSQL存储

python 爬取豆瓣电影TOP250

库

requests

import requests

lxml

from lxml import etree  # 采用xpath解析网页

pymysql

import pymysql  # 操作MySQL

数据库DDL

-- Recreate the table that receives one row per scraped Top 250 entry.
-- IF EXISTS lets the script run on a fresh database where db250 does not exist yet;
-- a plain DROP TABLE aborts with "Unknown table" in that case.
DROP TABLE IF EXISTS db250;
CREATE TABLE db250
(
  排名   INT DEFAULT 0 NOT NULL,   -- rank (numeric default, not the string '0')
  中文影名 VARCHAR(50)     NULL,    -- Chinese title
  英文影名 VARCHAR(80)     NULL,    -- English / original title
  评分   FLOAT           NULL,    -- rating
  导演   VARCHAR(60)     NULL,    -- director
  主演   VARCHAR(60)     NULL,    -- leading actors
  时间   VARCHAR(50)     NULL,    -- release year text
  地区   VARCHAR(20)     NULL,    -- country / region
  类型   VARCHAR(15)     NULL,    -- genres
  影评   VARCHAR(100)    NULL     -- one-line quote
) DEFAULT CHARSET = utf8mb4;      -- the stored text is Chinese, so do not rely on the server default charset

数据库连接

# Open the MySQL connection; the passwd and db values are placeholders to be
# replaced with the real password and database name.
self.db = pymysql.Connect(host='127.0.0.1', port=3306, user='root',passwd='密码', db='数据库名', charset='utf8')
# Cursor obtained from the connection above.
self.cursor = self.db.cursor()

构建网页链接

# Listing URL prefix; the start offset is appended right after "start=".
self.base_url = 'https://movie.douban.com/top250?start='
# One URL per offset 0, 25, ..., 225 (range stops before 226), each suffixed with "&filter=".
self.url_list = [self.base_url + str(i) + "&filter=" for i in range(0, 226, 25) ]

请求及解析网页

for url in self.url_list:

        # Request the page
        req = requests.get(url, headers=self.headers)
        # Set the response encoding
        # NOTE(review): the parse below is given req.content (raw bytes), so this
        # setting presumably has no effect on it - confirm.
        req.encoding = 'utf-8'
        # Parse the page into an element tree
        root = etree.HTML(req.content)

        # One <div class="item"> element per list entry
        texts = root.xpath('//ol/li/div[@class="item"]')

        for text in texts:
            # Initialise the fields that may be missing from an entry
            Chinese_Name = ''
            English_Name = ''
            director = 'NULL'
            actor = 'NULL'

            # Rank
            rank = int(text.xpath('./div[@class="pic"]/em/text()')[0]) 
            # Link to the movie page
            movie_url = text.xpath('./div[@class="pic"]/a/@href')[0]
            # Title texts (a list; the full code below handles one or two entries)
            name = text.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
            # Text nodes holding director/actors and year/region/genre
            director_actor = text.xpath('./div[@class="info"]/div[@class="bd"]//p[@class]/text()')
            # Rating
            rating_num = text.xpath('./div[@class="info"]//div[@class="star"]/span[@class="rating_num"]/text()')[0]
            # One-line quote (the list is empty when the entry has none)
            quote = text.xpath('./div[@class="info"]//p[@class="quote"]/span[@class="inq"]/text()')

代码

import requests
import pymysql
from lxml import etree

class DB250_spride(object):
    """Scrape the Douban Top 250 listing pages and store every movie in MySQL.

    The constructor builds the list of page URLs and opens the database
    connection. ``Parse_url_and_InsertDB`` downloads and parses each page and
    inserts one row per movie. ``CloseDB`` releases the cursor and the
    connection and should be called once the scrape is finished.
    """

    # Seconds to wait for a page; without a timeout requests.get can block forever.
    _REQUEST_TIMEOUT = 10

    # Parameterized INSERT: the driver quotes every value itself, so titles or
    # names containing quotes need no manual escaping and cannot break the SQL.
    _INSERT_SQL = (
        "insert into yw_test.db250"
        "(排名, 中文影名, 英文影名, 评分, 导演, 主演, 时间, 地区, 类型, 影评) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self):
        """Build the page URLs, the request headers and the MySQL connection."""
        self.base_url = 'https://movie.douban.com/top250?start='
        # Offsets 0, 25, ..., 225: one URL per listing page.
        self.url_list = [self.base_url + str(i) + "&filter=" for i in range(0, 226, 25)]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        # Connect to the database ('密码' is a placeholder for the real password).
        self.db = pymysql.Connect(host='127.0.0.1', port=3306, user='root', passwd='密码', db='yw_test', charset='utf8')
        self.cursor = self.db.cursor()

    @staticmethod
    def _split_titles(titles):
        """Return ``(chinese_name, english_name)`` from the title span texts.

        The second title is prefixed on the page with a "/" separator wrapped
        in non-breaking spaces; only that prefix is removed. A missing title
        yields an empty string.
        """
        chinese_name = titles[0].strip() if titles else ''
        english_name = ''
        if len(titles) >= 2:
            # str.strip() also removes the non-breaking spaces around the "/".
            english_name = titles[1].strip().lstrip('/').strip()
        return chinese_name, english_name

    @staticmethod
    def _split_credits(credit_line):
        """Return ``(director, actor)`` from the "导演: ... 主演: ..." line.

        Spaces and newlines are dropped, then the line is split at the run of
        three non-breaking spaces that separates the two parts. A part that is
        absent or empty is returned as ``None`` so it is stored as SQL NULL.
        """
        cleaned = credit_line.replace(' ', '').replace('\n', '')
        director_part, _, actor_part = cleaned.partition('\xa0' * 3)
        director = director_part.replace('导演:', '').strip()
        actor = actor_part.replace('主演:', '').strip()
        return director or None, actor or None

    @staticmethod
    def _split_details(detail_line):
        """Return ``(year, location, types)`` from the "year / region / genre" line.

        Each field is stripped, so the newlines surrounding the line on the
        page do not end up in the stored year or genre. Missing fields are
        returned as ``None`` instead of raising ``IndexError``.
        """
        fields = [part.strip() or None
                  for part in detail_line.replace(' ', '').split('\xa0/\xa0')]
        fields += [None] * (3 - len(fields))
        return fields[0], fields[1], fields[2]

    def _parse_item(self, item):
        """Extract one movie from its ``<div class="item">`` element.

        Returns a tuple in the column order used by ``_INSERT_SQL``. Raises
        ``IndexError`` or ``ValueError`` when a mandatory node (rank, rating,
        credit lines) is missing or not numeric.
        """
        rank = int(item.xpath('./div[@class="pic"]/em/text()')[0])
        titles = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        lines = item.xpath('./div[@class="info"]/div[@class="bd"]//p[@class]/text()')
        rating = float(item.xpath('./div[@class="info"]//div[@class="star"]/span[@class="rating_num"]/text()')[0])
        quote = item.xpath('./div[@class="info"]//p[@class="quote"]/span[@class="inq"]/text()')

        chinese_name, english_name = self._split_titles(titles)
        director, actor = self._split_credits(lines[0])
        year, location, types = self._split_details(lines[1])
        # Some movies have no quote; store NULL rather than the text 'NULL'.
        quote_text = quote[0] if quote else None
        return (rank, chinese_name, english_name, rating,
                director, actor, year, location, types, quote_text)

    @staticmethod
    def _print_row(row):
        """Print one parsed movie between two banner lines (progress output)."""
        rank, chinese_name, english_name, rating, director, actor, year, location, types, quote = row
        banner = "*" * 60
        print(banner, end='\n\n')
        for value in (rank, chinese_name, english_name, rating,
                      year, location, types, director, actor):
            print(value)
        if quote is not None:
            print(quote)
        print(banner, end='\n\n')

    def _insert_row(self, row):
        """Insert one movie row, committing on success and rolling back on error."""
        try:
            self.cursor.execute(self._INSERT_SQL, row)
            self.db.commit()
        except pymysql.MySQLError as e:
            self.db.rollback()
            print(e)

    def Parse_url_and_InsertDB(self):
        """Download every listing page, parse its movies and insert them.

        A page that cannot be fetched and an entry that cannot be parsed are
        reported on stdout and skipped, so one bad page or entry does not
        abort the whole run.
        """
        for url in self.url_list:
            try:
                req = requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
                req.raise_for_status()
            except requests.RequestException as e:
                print(e)
                continue

            # Decode the raw bytes as UTF-8 in the parser itself; setting
            # req.encoding has no effect when req.content is parsed.
            root = etree.HTML(req.content, parser=etree.HTMLParser(encoding='utf-8'))
            if root is None:
                continue

            for item in root.xpath('//ol/li/div[@class="item"]'):
                try:
                    row = self._parse_item(item)
                except (IndexError, ValueError) as e:
                    print(e)
                    continue
                self._print_row(row)
                self._insert_row(row)

    def CloseDB(self):
        """Close the cursor and then the database connection."""
        self.cursor.close()
        self.db.close()

def main():
    """Run the scraper once and always release the database connection."""
    spride1 = DB250_spride()
    try:
        spride1.Parse_url_and_InsertDB()
    finally:
        # Close the cursor and connection even when the scrape raises;
        # previously CloseDB was never called and both were leaked.
        spride1.CloseDB()



if __name__ == '__main__':
    main()