Python web scraping: using the requests module and Beautiful Soup to scrape data from https://www.ptt.cc/bbs/NBA/index.html
Fetching a page with requests
import requests
url = "http://www.baidu.com"
r = requests.get(url)
"""
r.status_code   check the response status code
r.encoding      check the page's encoding
r.text          print the returned page source
"""
Extracting page content with Beautiful Soup
import requests
from bs4 import BeautifulSoup

url = "https://www.ptt.cc/bbs/NBA/index.html"  # the site to scrape
# Imitate a browser by sending a User-Agent request header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
# Get the response; it carries the HTML source that will be parsed
response = requests.get(url, headers=headers)
# print(response.text)
# Check whether the request succeeded
if response.status_code == 200:  # a normal page returns status code 200
    # Save the page to output.html; use utf-8 so the text is not garbled
    with open("output.html", "w", encoding="utf-8") as f:
        f.write(response.text)  # write the page source to the file
    print("File written successfully")
else:
    print("Failed to write file")
# Create a BeautifulSoup object from the saved utf-8 HTML file
b_soup = BeautifulSoup(open("output.html", encoding="utf-8"),  # open the file with the right encoding
                       "html.parser")                          # parse it as HTML
# print(b_soup.prettify())  # optionally inspect the parsed HTML tree
# On PTT's board index, each post title sits inside <div class="title">
poems_all = b_soup.find_all("div", class_="title")  # collect every title block
for poem in poems_all:
    print(poem)
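The loop above prints each whole <div>; a small sketch, assuming PTT's current markup, of pulling out just the post title and its link (hrefs on PTT are relative, and deleted posts have no <a> tag):

for poem in poems_all:
    link = poem.find("a")
    if link is not None:  # skip deleted posts, which have no <a> tag
        title = link.get_text(strip=True)
        href = "https://www.ptt.cc" + link["href"]  # build the absolute URL
        print(title, href)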