HTML knowledge (learned by writing crawlers)

How browsers work


HTML
CSS
JavaScript
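
For scraping, the practical takeaway is that requests only receives the raw HTML the server sends back; the browser then applies CSS and runs JavaScript to build the page you actually see. A minimal sketch of that "static" view (same Douban URL as used below):

import requests

# fetch the raw HTML exactly as the server returns it (no CSS applied, no JavaScript executed)
url = "https://book.douban.com/"
res = requests.get(url)
print(res.status_code)   # 200 means the request succeeded
print(res.text[:300])    # first 300 characters of the static HTML
# anything rendered by JavaScript after page load will NOT appear in res.text;
# that is why the selenium section at the end drives a real browser instead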
Static data
import requests
from bs4 import BeautifulSoup
import pandas as pd

pd.set_option("display.max_columns",None) # show all columns

url = "https://book.douban.com/"
res = requests.get(url)
html = res.text
# html

soup = BeautifulSoup(html,"html.parser")
ul = soup.find("ul",class_="list-col list-col2 list-summary s")
li_all = ul.find_all("li")

data = []
for li in li_all:
    row = []
    书名 = li.find("h4").text.strip()  # don't forget to strip whitespace!          # book title
    评分 = li.find("span",class_="average-rating").text.strip()                     # rating
    星级 = li.find("span",{"class":"star-img"})["class"][0][-2:]                    # star level (last two digits of the class name)
    作者 = li.find("p",class_="author").text.split("：")[1].strip()                 # author
    # 作者 = li.find("p",class_="author").text.strip()[3:] # same effect as above
    类型 = li.find("p",class_="book-list-classification").text.strip()              # category
    评论 = li.find("p",class_="reviews")                                            # review
    if 评论:   # if there is a review
        评论 = li.find("p",class_="reviews").text.strip()
    else:
        评论 = None
#     print(Name,Grade,Author,Book_type,Comment)
    row.append(书名)
    row.append(评分)
    row.append(星级)
    row.append(作者)
    row.append(类型)
    row.append(评论)
    data.append(row)

df = pd.DataFrame(data = data,columns = ["书名","评分","星级","作者","类型","评论"])
df
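
If the request above comes back empty or with an error status, a common cause is that the site rejects clients without a browser-like User-Agent. A hedged sketch of adding one (the UA string is just an example):

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # example browser-like UA
res = requests.get("https://book.douban.com/", headers=headers)
print(res.status_code)   # check this before parsing; a non-200 status means no usable page came back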


Dynamic data
import matplotlib.pyplot as plt   # plt is used to display the image
import matplotlib.image as mpimg  # mpimg is used to read the image
import numpy as np

# url
url = "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2563546656.jpg"  # image url
# request
res = requests.get(url)

# get the image content and save it locally
filename = url.split("/")[-1]      # use the image code in the url (p2563546656) as the file name
# filename
with open(filename, 'wb') as f:    # write the image to a local file
    f.write(res.content)
    photo = mpimg.imread(filename) # read the image back in
    plt.imshow(photo)              # display the image
    plt.axis('off')                # hide the axes
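
Before writing the bytes to disk, it is also worth checking that the request actually succeeded and returned an image; a small sketch along those lines, using the same URL:

import requests

url = "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2563546656.jpg"
res = requests.get(url)

# only save when the request succeeded and the body really is an image
if res.status_code == 200 and res.headers.get("Content-Type", "").startswith("image"):
    with open(url.split("/")[-1], "wb") as f:
        f.write(res.content)
else:
    print("unexpected response:", res.status_code, res.headers.get("Content-Type"))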


Multiple pages
import time
import random

def get_page(page):
    # request page `page`, build a BeautifulSoup object, and locate div.balist
    # (the fetching lines did not survive here; see the selenium section below,
    #  which applies exactly the same parsing to the same div.balist element)
    ul_newlist = div_balist.find_all("li")

    data = []       # stores all posts
    for li in ul_newlist:
        row = []  # stores one post
        yuedu = li.find_all("cite")[0].text.strip()         # read count
        row.append(yuedu)
        pinglun = li.find_all("cite")[1].text.strip()       # comment count
        row.append(pinglun)
        tieba_a = li.find("a", class_="balink")             # forum (tieba) name
        if tieba_a:
            tieba = tieba_a.text
        else:
            tieba = None
        row.append(tieba)
        note = li.find("a", class_="note").text         # post title
        row.append(note)
        author = li.find("font").text                   # post author
        row.append(author)
        date = li.find("cite", class_="last").text      # post time
        row.append(date)

        data.append(row)
        # print(row)

    columns = ["阅读","评论","贴吧","标题","作者","更新时间"]
    df = pd.DataFrame(data=data, columns=columns)
    return df

df_all = pd.DataFrame()
for i in range(1, 13):
    print("-----------------第%d页-----------------" % i)
    df = get_page(i)
    print(df)
    df_all = pd.concat([df_all, df])  # concatenate the pages (DataFrame.append was removed in pandas 2.x)
    time.sleep(random.randint(1, 9))

print(df_all.shape)

# save the scraped content to a .csv file
df_all.to_csv("贴吧评论.csv")  # convert the exported file's encoding to ANSI so Excel reads it correctly (see file: 贴吧评论(Excel).csv)
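
Instead of converting the file's encoding by hand afterwards, the encoding can be chosen at export time. A sketch: utf-8-sig writes a BOM so Excel detects UTF-8 automatically, and gbk is the legacy "ANSI" code page on Chinese Windows.

df_all.to_csv("贴吧评论.csv", index=False, encoding="utf-8-sig")    # Excel-friendly UTF-8 with BOM
# df_all.to_csv("贴吧评论(Excel).csv", index=False, encoding="gbk") # alternative: legacy "ANSI" code page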


Multi-level pages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

pd.set_option("display.max_columns", None)  # show all columns

# ==============================Level 1================================ #
# url
url = "https://movie.douban.com/"
# GET request
res = requests.get(url)
# html
html = res.text

# build a BeautifulSoup object and parse the html
soup = BeautifulSoup(html, "html.parser")
# first locate div#billboard
# div = soup.find("div", id="billboard")
div = soup.find("div", attrs={"id": "billboard"})  # equivalent to the line above
# then go down to the table
table = div.find("table")
# then find all tr tags inside the table
tr_all = table.find_all("tr")

# iterate and extract each row's fields (rank, title, url, ID)
data = []
for tr in tr_all:
    row = []
    order = tr.find("td", class_="order").text     # rank
    a = tr.find("a")
    title = a.text                                 # title
    # movie_url = a["href"]
    # movie_url = a.attrs["href"]                  # link
    movie_url = a.attrs.get("href")   # preferred: returns None if the attribute is missing
    ID = a.attrs.get("href")[-9:-1]                # ID

    row.append(order)
    row.append(title)
    row.append(movie_url)
    row.append(ID)
    data.append(row)
    # print(order, title, movie_url)

# build a DataFrame
df = pd.DataFrame(data=data, columns=["排名", "电影名称", "url","ID"])
# print(df)
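
The slice [-9:-1] for the ID assumes every href has exactly the same length. A slightly more robust variant pulls the digits out with a regular expression (a sketch, assuming the usual https://movie.douban.com/subject/<id>/ URL shape):

import re

def movie_id_from_url(movie_url):
    # assumes URLs of the form https://movie.douban.com/subject/1234567/
    m = re.search(r"/subject/(\d+)", movie_url)
    return m.group(1) if m else None

# e.g. movie_id_from_url("https://movie.douban.com/subject/26266893/") returns "26266893"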

# ==============================Level 2================================ #
# scrape movie details (second-level pages)
all_url = df.iloc[:,-2]  # collect each movie's url into all_url
# print(all_url)

# define a function that scrapes one movie
def get_one_movie(movie_url):
    # request
    content = requests.get(movie_url).text   # level 2: the request is again sent via the hyperlink

    # build a BeautifulSoup instance
    soup = BeautifulSoup(content, "html.parser")
    # find div.subjectwrap
    div_subjectwrap = soup.find("div", class_="subjectwrap")

    # 1) movie information
    div_info = div_subjectwrap.find("div", id="info")

    # director
    span_dy = div_info.find("span", class_="pl", text="导演")  # locate the tag via two attributes
    director = span_dy.find_next_sibling().text  # .find_next_sibling(): find the next sibling on the same level
    # writer
    span_writer = div_info.find("span", class_="pl", text="编剧")
    writer = span_writer.find_next_sibling().text
    # cast
    span_actor = div_info.find("span", class_="pl", text="主演")
    actor = span_actor.find_next_sibling().text
    # genre
    span_genre = div_info.find("span", class_="pl", text="类型:")
    genres = span_genre.find_next_siblings("span", property="v:genre") # .find_next_siblings(): all following siblings
    genre = ""
    for g in genres:
        genre += g.text + ","   # concatenate the genres into one string, separated by ","

    # 2) rating information
    div_self = div_subjectwrap.find("div", class_="rating_self")

    # rating
    rate = div_self.find("strong").text  # by inspection, the strong tag is unique to the rating
    # star level
    star = div_self.find("div", class_="bigstar").get("class")[2][-2:]
                                # locate the div whose class contains bigstar, then take the last two
                                # digits of its third class value, which encode the star level
    # number of raters
    rater = div_self.find("span", property="v:votes").text
    print(director, writer, actor, genre, rate, star, rater)  

#     row.append(director)
#     row.append(writer)
#     row.append(actor)
#     row.append(genre)
#     row.append(rate)
#     row.append(star)
#     row.append(rater)
#     data.append(row)
# df_new = pd.DataFrame(data=data,columns=["排名", "电影名称", "url","ID","导演","编剧","主演","类型","评分","星级","评论数"] )
    
# request each url in turn
for url in all_url:
    print("----------------------------")
    get_one_movie(url)
    time.sleep(random.randint(1,9))
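
The commented-out lines above sketch how to keep the fields instead of only printing them. One way to finish that idea, assuming get_one_movie() is changed to end with return [director, writer, actor, genre, rate, star, rater] rather than print:

# assumption: get_one_movie() now returns [director, writer, actor, genre, rate, star, rater]
details = []
for url in all_url:
    details.append(get_one_movie(url))   # one row of details per movie
    time.sleep(random.randint(1, 9))     # be polite: random pause between requests

df_details = pd.DataFrame(details, columns=["导演", "编剧", "主演", "类型", "评分", "星级", "评论数"])
# line the detail rows up with the level-1 DataFrame built earlier
df_new = pd.concat([df.reset_index(drop=True), df_details], axis=1)
print(df_new)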


Carousel
import requests
from bs4 import BeautifulSoup

url = "https://movie.douban.com/"
html = requests.get(url).text

soup = BeautifulSoup(html,"html.parser")
div_screening = soup.find("div",id = "screening")
# div_bd = div_screening.find("div",class_="screening-bd")
# print(div_bd)  # the markup here turns out to be non-standard, which makes scraping harder

# because the structure is irregular, first grab all the li elements under div_screening,
# then filter them on whether they carry a data-ticket attribute:
# any li with a data-ticket attribute is one that holds the movie information
li_all = div_screening.find_all("li",attrs={"class":"ui-slide-item"})

# iterate over all the li elements
for item in li_all:
    # check whether the data-ticket attribute is present
    if item.get("data-ticket"):   # if this li is a carousel item, read its attributes
        # movie_title
        movie_title = item.get("data-title")             # title
        # directors
        movie_directors = item.get("data-directors")     # directors
        # actors
        movie_actors = item.get("data-actors")           # cast
        # durations
        movie_durations = item.get("data-duration")      # duration
        # rate
        movie_rate = item.get("data-rate")               # rating
        # data-rater
        movie_rater = item.get("data-rater")             # number of raters
        # data-region
        regions = item.get("data-region")                # region
        # data-release
        releases = item.get("data-release")              # release date
        # data-star="35"
        movie_star = item.get("data-star")               # star level
        # movie_id
        movie_id = item.get("data-ticket").split("=")[1] # ID

        print(movie_id, movie_title, movie_actors, movie_directors, movie_durations, movie_rate, movie_rater, regions, releases, movie_star)
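
As in the earlier sections, the printed fields can just as easily be collected into a DataFrame; a short sketch of the same loop in that style:

import pandas as pd

rows = []
for item in li_all:
    if item.get("data-ticket"):                      # only carousel items carry the data-* attributes
        rows.append([
            item.get("data-ticket").split("=")[1],   # ID
            item.get("data-title"),                  # title
            item.get("data-directors"),              # directors
            item.get("data-actors"),                 # cast
            item.get("data-rate"),                   # rating
        ])

df_carousel = pd.DataFrame(rows, columns=["ID", "title", "directors", "actors", "rate"])
print(df_carousel)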


selenium
# import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# the url to request
url = "http://guba.eastmoney.com/"

# request
# res = requests.get(url)

# before running the code below, make sure chromedriver has been added to the PATH
# browser = webdriver.Chrome("./driver/chromedriver.exe")
browser = webdriver.Chrome("chromedriver.exe")
browser.get(url)
html = browser.page_source

# preview
# print(res.text)

# build a BeautifulSoup object instance
soup = BeautifulSoup(html, "html.parser")

# first locate div.balist
# div_balist = soup.find("div", attrs={"class":"balist"})
div_balist = soup.find("div", class_="balist")   # equivalent to the line above

# <ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">
ul_newlist = div_balist.find_all("li")

data = []       # stores all posts
for li in ul_newlist:
    row = []  # stores one post
    yuedu = li.find_all("cite")[0].text.strip()         # read count
    row.append(yuedu)
    pinglun = li.find_all("cite")[1].text.strip()       # comment count
    row.append(pinglun)
    tieba_a = li.find("a", class_="balink")             # forum (tieba) name
    if tieba_a:
        tieba = tieba_a.text
    else:
        tieba = None
    row.append(tieba)
    note = li.find("a", class_="note").text         # post title
    row.append(note)
    author = li.find("font").text                   # post author
    row.append(author)
    date = li.find("cite", class_="last").text      # post time
    row.append(date)

    data.append(row)
    # print(row)

columns = ["阅读","评论","贴吧","标题","作者","更新时间"]
df = pd.DataFrame(data=data, columns=columns)

print(df)
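
Two small refinements worth adding in practice: the browser can run headless (no visible window), and it should be closed when scraping is done. A sketch in the selenium 4 style, where the driver path is wrapped in a Service object (the path here is just an example):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("--headless")   # run Chrome without opening a window

browser = webdriver.Chrome(service=Service("chromedriver.exe"), options=options)  # example driver path
try:
    browser.get("http://guba.eastmoney.com/")
    html = browser.page_source       # the rendered HTML, after JavaScript has run
finally:
    browser.quit()                   # always release the browser process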

