# 案例一:爬取豆瓣网评分最高的250部电影 (Case 1: scrape Douban's 250 top-rated movies)
import requests
import bs4
# Collect [title, rating, year] for every movie on Douban's Top 250 list.
# The list is paginated: 10 pages of 25 entries, offset via the `start` query arg.
start = 0
result = []
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/99.0.4844.84 Safari/537.36 "
}
for _page in range(10):
    res = requests.get(f"https://movie.douban.com/top250?start={start}&filter=", headers=header)
    res.encoding = "utf-8"
    start += 25
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    for info in soup.find_all("div", "info"):
        title = info.div.a.span.string
        # The release year lives in the third child node of the description <p>;
        # strip the layout whitespace around it before slicing the 4-digit year.
        yearline = info.find("div", class_="bd").p.contents[2].string
        yearline = yearline.replace("\n", "").replace(" ", "")
        rating = info.find("span", {"class": "rating_num"}).string
        result.append([title, rating, yearline[0:4]])
print(result)
# 案例一(改进):爬取豆瓣网评分最高的250部电影,并放到Excel文件里 (Case 1, improved: also export the Top 250 to an Excel file)
import requests
import bs4
import openpyxl
# Scrape the Top 250 list page by page; rows accumulate in the module-level
# `result`, which `to_excel()` below turns into a workbook.
start = 0
result = []
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/99.0.4844.84 Safari/537.36 "
}
for _ in range(10):
    res = requests.get("https://movie.douban.com/top250?start=" + str(start) + "&filter=", headers=header)
    res.encoding = "utf-8"
    start += 25
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    for entry in soup.find_all("div", "info"):
        movie_title = entry.div.a.span.string
        # Third child of the description <p> carries "year / country / genre".
        raw_year = entry.find("div", class_="bd").p.contents[2].string
        raw_year = raw_year.replace("\n", "").replace(" ", "")
        score = entry.find("span", {"class": "rating_num"}).string
        result.append([movie_title, score, raw_year[0:4]])
def to_excel(result):
    """Write the scraped rows to 豆瓣TOP250.xlsx.

    result: list of [movie title, rating string, 4-char year string] rows.
    """
    wb = openpyxl.Workbook()
    # Fix: the original set `wb.guess_type` — a typo for `guess_types` (the
    # spelling the sibling case at the bottom of this file uses).  openpyxl
    # never reads `guess_type`, so the line silently did nothing.
    # NOTE(review): recent openpyxl versions dropped guess_types entirely;
    # the assignment then just creates an unused attribute — confirm the
    # installed version before relying on it.
    wb.guess_types = True
    ws = wb.active
    ws.append(["电影名", "评分", "出版年份"])
    for each in result:
        ws.append(each)
    wb.save("豆瓣TOP250.xlsx")
def main():
    """Export the module-level `result` rows gathered above to Excel."""
    to_excel(result)


if __name__ == "__main__":
    main()
# 案例二:爬取2017年城市的平均房价,并生成Excel ==》主要是函数的封装、正则表达式的使用
# (Case 2: scrape 2017 average city house prices into Excel — function encapsulation and regular expressions)
import requests
import bs4
import re
import openpyxl
def open_url(url):
    """GET *url* with a desktop-browser User-Agent and return the Response."""
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/99.0.4844.84 Safari/537.36 "
    }
    return requests.get(url, headers=browser_headers)
def find_data(res):
    """Extract [city, avg house price, avg salary, price/salary ratio] rows
    from the QQ news article HTML in ``res.text``.

    Returns a list of 4-element lists of strings.
    """
    data = []
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    content = soup.find(id="Cnt-Main-Article-QQ")
    target = content.find_all("p", style="TEXT-INDENT: 2em")
    # Wrap the paragraph list in an iterator so the loop body can pull the
    # next four paragraphs of a record with next() while the for-loop itself
    # keeps advancing the same iterator.
    target = iter(target)
    for each in target:
        # A purely numeric paragraph is a rank number marking the start of a
        # record; the four paragraphs that follow hold the record's fields.
        # NOTE(review): a truncated article would make next() raise
        # StopIteration here — confirm the page always ends on a full record.
        if each.text.isnumeric():
            data.append([
                re.search(r"\[(.+)\]", next(target).text).group(1),  # city name inside [...]
                re.search(r"\d.*", next(target).text).group(),  # average house price
                re.search(r"\d.*", next(target).text).group(),  # average salary
                re.search(r"\d.*", next(target).text).group()  # price-to-salary ratio
            ])
    return data
def to_excel(data):
    """Save the scraped city rows into the ranking workbook."""
    workbook = openpyxl.Workbook()
    workbook.guess_types = True  # legacy openpyxl flag; kept as in the original
    sheet = workbook.active
    sheet.append(["城市", "平均房价", "平均工资", "房价工资比"])
    for row in data:
        sheet.append(row)
    workbook.save("2017年中国主要城市房价工资比排行榜.xlsx")
def main():
    """Fetch the 2017 house-price article, parse it, and export to Excel."""
    source_url = "https://news.house.qq.com/a/20170702/003985.htm"
    to_excel(find_data(open_url(source_url)))


if __name__ == "__main__":
    main()
# 案例三:爬取网易云音乐的某首歌的热评 (Case 3: scrape a song's hot comments from NetEase Cloud Music)
# 1. 从服务器获取数据需要用到 Form Data 的参数数据 (the request needs the Form Data parameters captured from the browser)
# 2. 得到的数据是json的形式要用json来解析 (the response is JSON and is parsed with the json module)
import requests
import json
def get_hot_comments(res):
    """Prompt for a file name and dump the response's "hotComments" to
    <name>热评.txt, one nickname/content pair per separator block."""
    file_name = input("请输入文件名:")
    hot_comments = json.loads(res.text)["hotComments"]
    with open("{}热评.txt".format(file_name), "w", encoding="utf-8") as file:
        for comment in hot_comments:
            file.write(comment["user"]["nickname"] + ":\n\n")
            # NOTE(review): the trailing ':' after the comment body looks
            # unintended, but is kept to reproduce the original output exactly.
            file.write(comment["content"] + ":\n")
            file.write("---------------------------------\n")
def get_comments(url):
    """POST the pre-encrypted form payload for the song id embedded in *url*
    and return the raw comments Response (JSON body)."""
    song_id = url.split("=")[1]
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/99.0.4844.84 Safari/537.36",
        "referer": "https://music.163.com/song?id=4466775"
    }
    # Encrypted payload captured from the browser's Form Data tab; the API
    # only accepts this exact params/encSecKey pair.
    form_data = {
        "params": ("snLPgVRqxAY1s1oq0iWZkjs0Sd7r5vVJ1vekAKqEVMk3R8kovMyjr3s/wjSTiIFl8qWtDjd5aDVYhgmoV9v+IINcoj"
                   "+qWJDgxCj6HixHkFhgjvrfHkNHXK2OjKa41pqKmdfLFwBNwQKdB8u0xV5EGyYy1Pb6xgB6l+M1ZVFmcdf0fnvm/iNxmWwYS"
                   "+hbZwT4tfzcPI3wQrnoiKmrBaSzS40Qq9aD2PPePssPr9qK+pqPNu3WCrEJeg2qzJje6V6AuzDqKjCJKrkT2O0GOZkwFQ== "),
        "encSecKey": "2f0f8a650977ec77083d50e09834dc0eda42d95311fb2623338f154e7e77db9ccc4250034777db85470cd3e682d2851f96f95b1441590857dddafbbb0d7f5ee3659c5ecebd0c9a0824ca19cb38376c958614d17c5ddd4f0bd4bdc973032d2e90c96015889691a5210c03d55ee573f47ffd01527ccc1f6523a95d41ea4191338b "
    }
    target_url = f"https://music.163.com/api/v1/resource/comments/R_SO_4_{song_id}?csrf_token="
    return requests.post(target_url, headers=request_headers, data=form_data)
def main():
    """Ask for a song page URL, fetch its comments, and save the hot ones."""
    song_url = input("请输入链接地址:")
    get_hot_comments(get_comments(song_url))


if __name__ == "__main__":
    main()
# 案例四:爬取百度图片 (Case 4: download Baidu image-search results)
import requests
import os
# Target directory for the downloaded pictures (Windows-style separator).
file_name = "图片\\"
# makedirs(exist_ok=True) is atomic from the caller's view — the original
# exists()/mkdir() pair could race and raise FileExistsError.
os.makedirs(file_name, exist_ok=True)
def get_url(url, headers):
    """Fetch *url* with the given request headers and return the Response."""
    return requests.get(url, headers=headers)
def get_pic(res, headers):
    """Download every image listed in the JSON response into `file_name`.

    res: Response whose JSON payload carries a "data" list of dicts, each
         expected to hold an image URL under "hoverURL".
    headers: request headers reused for the per-image downloads.
    """
    pics = res.json()["data"]
    # Iterate the actual payload instead of a hard-coded range(29): the API
    # may return fewer items, which previously raised IndexError.
    for i, item in enumerate(pics):
        pic_url = item.get("hoverURL")
        if not pic_url:
            # Some entries (e.g. the trailing empty dict Baidu appends) have
            # no hoverURL; skip them rather than crash with KeyError.
            continue
        pic_res = requests.get(pic_url, headers=headers)
        # os.path.join avoids the doubled separator the old
        # "{}\\".format(file_name) produced (file_name already ends in "\\").
        with open(os.path.join(file_name, str(i) + ".jpeg"), "wb") as file:
            file.write(pic_res.content)
def main():
    """Prompt for the image-search API URL, fetch it, and save the pictures."""
    # input() already returns a str, so the original "{}".format(...) wrapper
    # was a no-op and is dropped.
    url = input("请输入下载的地址(需要网页Header里面的URL):")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/99.0.4844.84 Safari/537.36 "
    }
    get_pic(get_url(url, headers), headers)
    print("下载完毕。")


if __name__ == "__main__":
    main()
# 案例五:爬取网易云音乐 (Case 5: download songs from a NetEase Cloud Music toplist)
import requests
import re
import os
# Download every song on a NetEase Cloud Music toplist as .mp3.
file_name = "eMusic\\"
# makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair.
os.makedirs(file_name, exist_ok=True)
url = "https://music.163.com/discover/toplist?id=3778678"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/99.0.4844.84 Safari/537.36 "
}
res = requests.get(url=url, headers=headers)
# Raw string: "\?" and "\d" in a plain literal are invalid escape sequences
# (SyntaxWarning on modern Python); the compiled pattern is unchanged.
html_data = re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', res.text)
for num_id, title in html_data:
    music_url = f"http://music.163.com/song/media/outer/url?id={num_id}.mp3"
    music_content = requests.get(url=music_url, headers=headers).content
    # Song titles may contain characters that are illegal in file names
    # (e.g. '/', '?', '"'); replace them so open() cannot fail.
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
    with open(file_name + safe_title + ".mp3", mode="wb") as file:
        file.write(music_content)
    print(num_id, title)
print("下载完毕!")