Python爬虫小练习1

import re
from time import sleep
import requests
from lxml import etree
import random
import csv
def main(page, f):
    """Scrape one listing page of the Douban Top 250 and record each film on it.

    The listing URL is built with ``start=page * 25``. For every film link
    found on that page, ``get_info`` is called with the detail-page URL and
    the film name; a failure on a single film is reported and skipped so the
    rest of the page is still processed.

    Args:
        page: Zero-based page index used to compute the ``start`` offset.
        f: The open output file object; it is flushed before each film is
            processed so rows written so far reach the disk.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched (including hitting the request timeout).
    """
    list_url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36', }
    # A timeout keeps a stalled connection from hanging the whole run.
    resp = requests.get(list_url, headers=headers, timeout=10)
    tree = etree.HTML(resp.text)
    href_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/@href')
    name_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')
    for detail_url, name in zip(href_list, name_list):
        f.flush()
        try:
            get_info(detail_url, name)
        except Exception as exc:
            # Best effort: one broken detail page must not abort the page,
            # but the failure is reported instead of silently discarded.
            # Catching Exception (not a bare except) lets Ctrl-C through.
            print(f'{name} 爬取失败: {exc!r}')
        # Randomised pause between detail requests.
        sleep(1 + random.random())
    # Report the page this call actually handled, not a caller's global.
    print(f'第{page + 1}页爬取完毕')
def get_info(url, name):
    """Fetch one film's detail page and append its data as a CSV row.

    Extracts the first listed director, all genres (joined with ``/``), the
    country/region text and the rating from the page, prints them, and
    writes them through the module-level ``csvwriter`` that the script entry
    point creates before calling into this function.

    Args:
        url: URL of the film's detail page.
        name: Film name to store in the first column.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            hitting the request timeout).
        IndexError: If the director, country or rating is not found in the
            page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36',
        'Host': 'movie.douban.com', }
    # A timeout keeps a stalled connection from hanging the whole run.
    resp = requests.get(url, headers=headers, timeout=10)
    html = resp.text
    tree = etree.HTML(html)
    # Only the first director link is kept.
    director = tree.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
    genres = '/'.join(re.findall(r'property="v:genre">(.*?)</span>', html))
    country = re.findall(r'地区:</span> (.*?)<br', html)[0]
    rate = tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
    # The release year and rating count were previously extracted here but
    # never written out; dropping those lookups means a missing node for an
    # unused field can no longer discard an otherwise complete row.
    row = (name, director, genres, country, rate)
    print(*row)
    csvwriter.writerow(row)
if __name__ == '__main__':
    # Script entry point: crawl pages 0-9 and append the rows to the CSV.
    with open('学号_姓名.csv', 'a', encoding='utf-8', newline='') as f:
        # `csvwriter` must stay a module-level name: get_info() writes its
        # rows through it as a global.
        csvwriter = csv.writer(f)
        # The file is opened in append mode, so only write the header when
        # the file is still empty; otherwise every rerun would insert another
        # header row in the middle of the data.
        if f.tell() == 0:
            csvwriter.writerow(('电影名称', '导演', '电影类型', '国家', '评分', ))
        # `i` is likewise kept as the module-level loop variable name, since
        # code in this file may read it as a global.
        for i in range(0, 10):
            main(i, f)
            # Longer randomised pause between listing pages.
            sleep(3 + random.random())

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值