# Fetch the raw HTML of the PTT movie board index page and print its
# title elements.
import urllib.request as req

import bs4

url = "https://www.ptt.cc/bbs/movie/index.html"

# Build a Request object that carries extra request headers.
# NOTE(review): the browser-like User-Agent is presumably sent so the server
# does not reject the default urllib client with HTTP 403 -- confirm.
request = req.Request(
    url,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
    },
)

# The context manager closes the HTTP response once the body has been read.
with req.urlopen(request) as response:
    data = response.read().decode("utf-8")

# Parse the HTML and collect every <div class="title"> element.
root = bs4.BeautifulSoup(data, "html.parser")
titles = root.find_all("div", class_="title")
print(titles)
# Topic: the server rejects the crawler with HTTP 403.
# (Web page residue) Latest recommended article published on 2022-04-20 14:23:52.