BeautifulSoup

1.BeautifulSoup初始化

soup = BeautifulSoup(html, 'html.parser')

2.find和find_all

查找某个元素,返回一个结果

soup.find("标签名", attrs={"属性": "值"})

查找某个元素,返回所有符合条件结果

result = soup.find_all("标签名", attrs={"属性": "值"})

取文本(result 须为单个元素,例如 find 的返回值,或 find_all 结果中的某一项)

result.text

取属性(result 须为单个元素)

result.get("属性名")

3.豆瓣Top250电影排名实例
import requests
from bs4 import BeautifulSoup
import pprint
import json
import pandas

from my_fake_useragent import UserAgent

# Start offsets of the ranking pages: 0, 25, ..., 225 (ten values).
page_indexs = range(0, 250, 25)


def down_htmls():
    """Download every page of the Douban Top 250 movie ranking.

    One request is sent per offset in ``page_indexs``, each with a randomly
    chosen User-Agent header.

    Returns:
        A list holding the HTML text of each page, in ``page_indexs`` order.

    Raises:
        Exception: if a page answers with a status code other than 200.
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    # Build the User-Agent source once instead of once per page.
    user_agent = UserAgent()
    htmls = []
    for idx in page_indexs:
        url = f"https://movie.douban.com/top250?start={idx}&filter"
        print("craw html:", url)
        headers = {'User-Agent': user_agent.random()}
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole script indefinitely.
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            # Include the status and URL so the failing page can be identified.
            raise Exception(f"error: status {r.status_code} for {url}")
        htmls.append(r.text)
    return htmls

# Download all ranking pages up front. This runs at module level, so the
# network requests happen as soon as the file is executed.
htmls = down_htmls()

def _parse_movie_item(entry):
    """Turn one ``<li>`` element of the ranking list into a record dict."""
    rank_text = entry.find("em").get_text()
    title_text = entry.find("span", class_="title").get_text()

    star_box = entry.find("div", class_="star")
    # The first <span> in the star box carries the star level in its first
    # CSS class; the "rating" and "-t" substrings are stripped from it below.
    star_class = star_box.find("span")["class"][0]
    score_text = star_box.find("span", class_="rating_num").get_text()
    # The fourth <span> in the star box holds the comment-count text.
    comments_text = star_box.find_all("span")[3].get_text()

    return {
        "rank": rank_text,
        "title": title_text,
        "rating_star": star_class.replace("rating", "").replace("-t", ""),
        "rating_num": score_text,
        "comments_num": comments_text,
    }


def extract_single_html(html):
    """Parse one ranking page and return its movie records.

    Args:
        html: HTML text of a single ranking page.

    Returns:
        A list of dicts, one per ``<li>`` found inside
        ``div.article > ol.grid_view``, with the string-valued keys
        ``rank``, ``title``, ``rating_star``, ``rating_num`` and
        ``comments_num``.
    """
    document = BeautifulSoup(html, 'html.parser')
    article_div = document.find("div", class_="article")
    ranking_list = article_div.find("ol", class_="grid_view")
    return [_parse_movie_item(entry) for entry in ranking_list.find_all("li")]
# datas = extract_single_html(htmls[0])
# pprint(datas)
# Merge the per-page record lists into one list covering every page.
all_datas = []
for html in htmls:
    all_datas += extract_single_html(html)

# Write the combined records to an Excel workbook (path is relative to the
# current working directory).
df = pandas.DataFrame(data=all_datas)
df.to_excel(excel_writer="250rankMovies.xlsx")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值