How browsers work
HTML
CSS
JavaScript
静态数据
import requests
from bs4 import BeautifulSoup
import pandas as pd

pd.set_option("display.max_columns", None)

# Fetch the Douban Books homepage and parse its book list (static HTML).
url = "https://book.douban.com/"
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, "html.parser")

# The featured-book list container; one <li> per book.
ul = soup.find("ul", class_="list-col list-col2 list-summary s")
li_all = ul.find_all("li")

data = []
for li in li_all:
    书名 = li.find("h4").text.strip()
    评分 = li.find("span", class_="average-rating").text.strip()
    # BUG FIX: the original referenced an undefined name `div4` here
    # (NameError); the star span lives inside the current <li>.
    # Its CSS class ends in a two-digit star level, e.g. "...45" -> "45".
    星级 = li.find("span", {"class": "star-img"})["class"][0][-2:]
    作者 = li.find("p", class_="author").text.split(":")[1].strip()
    类型 = li.find("p", class_="book-list-classification").text.strip()
    # Reviews are optional: reuse the tag found once instead of searching twice.
    评论_tag = li.find("p", class_="reviews")
    评论 = 评论_tag.text.strip() if 评论_tag else None
    data.append([书名, 评分, 星级, 作者, 类型, 评论])

df = pd.DataFrame(data=data, columns=["书名", "评分", "星级", "作者", "类型", "评论"])
df
动态数据
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np

# Download a poster image and display it inline (no axes).
url = "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2563546656.jpg"
res = requests.get(url)
# ROBUSTNESS FIX: fail loudly on an HTTP error instead of silently
# saving an error page's bytes as a .jpg.
res.raise_for_status()

# Use the last path segment of the URL as the local filename.
filename = url.split("/")[-1]
with open(filename, 'wb') as f:
    f.write(res.content)

photo = mpimg.imread(filename)
plt.imshow(photo)
plt.axis('off')
多页
# NOTE(review): this cell is truncated — judging by the `return df` at the
# end and the call `get_page(i)` below, it is the body of a `def get_page(i):`
# whose header line is missing, and line 1 has lost its left-hand side
# (presumably `ul_newlist = div_balist.find_all("li")` — TODO confirm against
# the original notebook). Code kept byte-identical; restore the missing
# pieces (and indentation) before running.
_balist.find_all("li")
data = []
# One row per forum entry: read count, comment count, forum link (optional),
# title, author, last-update time.
for li in ul_newlist:
row = []
# First two <cite> tags: read count and comment count.
yuedu = li.find_all("cite")[0].text.strip()
row.append(yuedu)
pinglun = li.find_all("cite")[1].text.strip()
row.append(pinglun)
# The forum link is optional on some rows; fall back to None.
tieba_a = li.find("a", class_="balink")
if tieba_a:
tieba = tieba_a.text
else:
tieba = None
row.append(tieba)
note = li.find("a", class_="note").text
row.append(note)
author = li.find("font").text
row.append(author)
date = li.find("cite", class_="last").text
row.append(date)
data.append(row)
columns = ["阅读","评论","贴吧","标题","作者","更新时间"]
df = pd.DataFrame(data=data, columns=columns)
return df
# These are used below but only imported in a later cell in the original
# notebook; import them here so this cell runs standalone.
import time
import random

# Crawl pages 1-12 and accumulate every page's rows into one DataFrame.
frames = []
for i in range(1, 13):
    print("-----------------第%d页-----------------" % i)
    df = get_page(i)
    print(df)
    frames.append(df)
    # Random pause between requests to avoid hammering the server.
    time.sleep(random.randint(1, 9))

# BUG FIX: DataFrame.append() was removed in pandas 2.0 (deprecated since
# 1.4); collect the per-page frames and concatenate once instead.
df_all = pd.concat(frames)
print(df_all.shape)
df_all.to_csv("贴吧评论.csv")
多级
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

pd.set_option("display.max_columns", None)

# Scrape the weekly-ranking "billboard" table from the Douban Movies homepage.
url = "https://movie.douban.com/"
res = requests.get(url)
html = res.text
soup = BeautifulSoup(html, "html.parser")

div = soup.find("div", attrs={"id": "billboard"})
table = div.find("table")
tr_all = table.find_all("tr")

data = []
for tr in tr_all:
    order = tr.find("td", class_="order").text
    a = tr.find("a")
    title = a.text
    movie_url = a.attrs.get("href")
    # BUG FIX: the original slice href[-9:-1] only works when the subject
    # ID is exactly 8 digits; extract the last path segment instead, which
    # works for IDs of any length (e.g. ".../subject/1292052/" -> "1292052").
    ID = movie_url.rstrip("/").split("/")[-1]
    data.append([order, title, movie_url, ID])

df = pd.DataFrame(data=data, columns=["排名", "电影名称", "url", "ID"])
# Second-to-last column holds the detail-page URLs consumed by the
# per-movie crawler below.
all_url = df.iloc[:, -2]
def get_one_movie(movie_url):
    """Fetch one Douban movie detail page and print its key facts.

    Prints: director, writer(s), lead actor(s), genres, rating, star level
    (two digits parsed from the big-star CSS class) and vote count.
    Network errors and missing page elements propagate to the caller.
    """
    content = requests.get(movie_url).text
    soup = BeautifulSoup(content, "html.parser")
    div_subjectwrap = soup.find("div", class_="subjectwrap")
    div_info = div_subjectwrap.find("div", id="info")
    # Each field label is a <span class="pl">; its value is the next sibling.
    span_dy = div_info.find("span", class_="pl", text="导演")
    director = span_dy.find_next_sibling().text
    span_writer = div_info.find("span", class_="pl", text="编剧")
    writer = span_writer.find_next_sibling().text
    span_actor = div_info.find("span", class_="pl", text="主演")
    actor = span_actor.find_next_sibling().text
    span_genre = div_info.find("span", class_="pl", text="类型:")
    genres = span_genre.find_next_siblings("span", property="v:genre")
    # FIX: join the genre names instead of manual += concatenation, which
    # left a dangling trailing comma (e.g. "剧情,喜剧,").
    genre = ",".join(g.text for g in genres)
    div_self = div_subjectwrap.find("div", class_="rating_self")
    rate = div_self.find("strong").text
    # The third CSS class ends in a two-digit level, e.g. "bigstar45" -> "45".
    star = div_self.find("div", class_="bigstar").get("class")[2][-2:]
    rater = div_self.find("span", property="v:votes").text
    print(director, writer, actor, genre, rate, star, rater)
# Walk every ranked movie's detail page, sleeping a random 1-9 seconds
# between requests to stay polite to the server.
for detail_url in all_url:
    print("----------------------------")
    get_one_movie(detail_url)
    time.sleep(random.randint(1, 9))
轮播图
import requests
from bs4 import BeautifulSoup

# Scrape the "now screening" carousel on the Douban Movies homepage; each
# slide's metadata is embedded as data-* attributes on the <li>.
url = "https://movie.douban.com/"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

div_screening = soup.find("div", id="screening")
li_all = div_screening.find_all("li", attrs={"class": "ui-slide-item"})

for item in li_all:
    # Only slides selling tickets carry data-ticket; skip the rest early.
    # IMPROVED: fetch the attribute once and reuse it, instead of the
    # original's second .get("data-ticket") lookup at the bottom.
    ticket = item.get("data-ticket")
    if not ticket:
        continue
    movie_title = item.get("data-title")
    movie_directors = item.get("data-directors")
    movie_actors = item.get("data-actors")
    movie_durations = item.get("data-duration")
    movie_rate = item.get("data-rate")
    movie_rater = item.get("data-rater")
    regions = item.get("data-region")
    releases = item.get("data-release")
    movie_star = item.get("data-star")
    # The movie id is the value after "=" in the ticket URL's query string.
    movie_id = ticket.split("=")[1]
    print(movie_id, movie_title, movie_actors, movie_directors, movie_durations, movie_rate, movie_rater, regions, releases, movie_star)
selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# The forum list is rendered by JavaScript, so drive a real browser and
# hand the rendered HTML to BeautifulSoup.
url = "http://guba.eastmoney.com/"
# NOTE(review): passing the driver path positionally is removed in
# Selenium 4 — prefer webdriver.Chrome(service=Service("chromedriver.exe"));
# kept as-is for the Selenium 3 API this notebook uses. TODO confirm version.
browser = webdriver.Chrome("chromedriver.exe")
try:
    browser.get(url)
    html = browser.page_source
finally:
    # BUG FIX: the original never shut the driver down, leaking a Chrome
    # process per run; always quit once the page source is captured.
    browser.quit()

soup = BeautifulSoup(html, "html.parser")
div_balist = soup.find("div", class_="balist")
ul_newlist = div_balist.find_all("li")

data = []
for li in ul_newlist:
    # Hoist the repeated find_all("cite"): index 0 is the read count,
    # index 1 the comment count.
    cites = li.find_all("cite")
    yuedu = cites[0].text.strip()
    pinglun = cites[1].text.strip()
    # The forum link is optional on some rows; fall back to None.
    tieba_a = li.find("a", class_="balink")
    tieba = tieba_a.text if tieba_a else None
    note = li.find("a", class_="note").text
    author = li.find("font").text
    date = li.find("cite", class_="last").text
    data.append([yuedu, pinglun, tieba, note, author, date])

columns = ["阅读","评论","贴吧","标题","作者","更新时间"]
df = pd.DataFrame(data=data, columns=columns)
print(df)