爬取豆瓣top250

最新推荐文章于 2024-09-21 00:11:16 发布

流浪法师321

最新推荐文章于 2024-09-21 00:11:16 发布

阅读量86

点赞数

分类专栏：python 爬虫　文章标签：程序人生

本文链接：https://blog.csdn.net/huoshi641/article/details/115124055

版权

python 爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

爬取豆瓣top250

- 导入模块
- 主体代码

导入模块

import requests
from lxml import etree
import csv

主体代码

class Douban:
    """Scrape the Douban Top 250 movie list and write it to a CSV file.

    Call :meth:`run` to fetch every list page, extract one record per
    movie (``title`` and ``url``), and save the records as CSV.
    """

    def __init__(self):
        # URL template; ``{}`` receives the ``start`` offset of the page.
        self.starturl = 'https://movie.douban.com/top250?start={}&filter='

    def run(self):
        """Fetch all list pages, collect the movie records, and save them."""
        movielist = []
        # Ten requests with offsets 0, 25, ..., 225.
        for i in range(10):
            pageurl = self.starturl.format(i * 25)
            text = self.get_text(pageurl)
            movielist.extend(self.analysis_text(text))
        self.writeData(movielist)

    def get_text(self, url, timeout=10):
        """Download ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.

        Returns:
            The response body as ``str``.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
                Failing loudly is preferable to silently parsing an error
                page into an empty result.
        """
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"
        }
        response = requests.get(url, headers=header, timeout=timeout)
        response.raise_for_status()
        response.encoding = "utf-8"
        return response.text

    def analysis_text(self, text):
        """Extract the movie records from the HTML of one list page.

        Args:
            text: HTML source of a list page.

        Returns:
            A list of dicts with the keys ``title`` (all title fragments
            concatenated) and ``url`` (the first link found, or ``''`` when
            the entry has no link).
        """
        element = etree.HTML(text)
        if element is None:
            # NOTE(review): lxml may hand back None for a document without
            # any markup -- treat that as "no movies" instead of crashing.
            return []

        movielist = []
        for eachmovie in element.xpath('//div[@class="info"]'):
            title = eachmovie.xpath('./div[@class="hd"]/a/span[@class="title"]/text()')
            othertitle = eachmovie.xpath('./div[@class="hd"]/a/span[@class="other"]/text()')
            move_link = eachmovie.xpath('./div[@class="hd"]/a/@href')

            movielist.append({
                'title': ''.join(title + othertitle),
                # xpath() returns a list; store the URL string itself so the
                # CSV cell does not contain a Python list repr.
                'url': move_link[0] if move_link else '',
            })
        return movielist

    def writeData(self, movielist, filename='douban.csv'):
        """Write the movie records to a UTF-8 CSV file with a header row.

        Args:
            movielist: Iterable of dicts with the keys ``title`` and ``url``.
            filename: Path of the CSV file to create or overwrite.
        """
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'url'])
            writer.writeheader()
            writer.writerows(movielist)

if __name__ == "__main__":
    # Script entry point: build the scraper and run the full crawl.
    crawler = Douban()
    crawler.run()