import urllib.request
def get_html_text(url):
    """Fetch *url* and return its HTML body as text.

    On any request failure the exception is printed and its string
    representation is returned instead, so callers always get a str.

    :param url: the page URL to download
    :return: decoded HTML text, or the error message string on failure
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        # requests' timeout is in seconds; the original value 3000 (~50 min)
        # was almost certainly a typo for a few seconds.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raise HTTPError for non-2xx status codes
        r.encoding = r.apparent_encoding  # detect encoding from the content
        return r.text
    except requests.RequestException as e:
        # Narrowed from BaseException: never swallow KeyboardInterrupt or
        # SystemExit; RequestException covers all requests failures
        # (including the HTTPError from raise_for_status and timeouts).
        print("出现异常:", e)
        return str(e)
def writefile(file_name, content_str):
    """Write *content_str* to *file_name* as UTF-8 text.

    The file is truncated if it already exists.  The ``with`` block closes
    the file automatically; the original trailing ``f.close`` was missing
    its call parentheses and was a no-op anyway, so it has been removed.

    :param file_name: path of the file to (over)write
    :param content_str: string content to store
    """
    with open(file_name, "w", encoding='utf-8') as f:
        f.write(content_str)
# 爬虫代码 — scraper entry point: fetch the Douban Top 250 page, print
# titles and ratings, and download the poster images.
import requests
import re
from bs4 import BeautifulSoup
import os

print("开始爬虫")
url = "https://movie.douban.com/top250"
html_text = get_html_text(url)
# writefile("a.txt", html_text)
soup = BeautifulSoup(html_text, "html.parser")
title_nodes = soup.find_all("span", class_="title")
rating_nodes = soup.find_all("span", class_="rating_num")
img_nodes = soup.find_all("img")

# Print only the primary titles.  The alternate-title span looks like
# "\xa0/\xa0Other Name", i.e. its second character is '/'.  Also guard
# against .string being None (happens when the tag has nested markup)
# or shorter than two characters — the original crashed on both.
for node in title_nodes:
    title = node.string
    if title is None or (len(title) > 1 and title[1] == '/'):
        continue
    print(title)

for node in rating_nodes:
    print(node.string)

# Ensure the destination folder exists before downloading; urlretrieve
# raises FileNotFoundError otherwise.
pic_dir = "C:/Users/416/Desktop/pics"
os.makedirs(pic_dir, exist_ok=True)
for img in img_nodes:
    link = img.get("src")
    name = img.get("alt")
    if not link or not name:
        # Decorative/tracking images lack src or alt; the original raised
        # KeyError on them.
        continue
    print(name)
    print(link)
    urllib.request.urlretrieve(link, "%s/%s.jpg" % (pic_dir, name))
# Python 课堂小作业 top250 (class assignment: Douban Top 250 scraper)
# 最新推荐文章于 2024-01-17 06:54:50 发布