import urllib.request
import os
import re
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    Sends a Referer and a browser-like User-Agent header because Douban
    rejects plain urllib requests without them.
    """
    req = urllib.request.Request(url)
    # Adding a Referer header is what makes the request succeed.
    req.add_header("Referer", "https://movie.douban.com/top250?start=25&filter=")
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36")
    # Use a context manager so the connection is closed instead of leaked.
    with urllib.request.urlopen(req) as response:
        return response.read()
def find_imgs(url):
    """Return the list of movie titles scraped from the page at *url*.

    Despite the historical name, this extracts <span class="title"> text;
    the [^&] in the pattern skips the alternate-title spans that begin
    with an HTML entity.
    """
    page = open_url(url).decode("utf-8")
    title_pattern = r'<span class=\"title\">([^&].+?)</span>'
    return re.findall(title_pattern, page)
def save(img_addrs):
    """Write the {rank: title} mapping to 豆瓣top250.txt, one entry per line.

    Each line is "<rank> <title> " followed by a newline.
    """
    # Explicit utf-8 so Chinese titles survive regardless of the platform's
    # default encoding; the `with` block closes the file (the original's
    # bare `f.close` without parentheses was a no-op).
    with open("豆瓣top250.txt", "w", encoding="utf-8") as f:
        for rank, title in img_addrs.items():
            print(rank)
            print(title)
            f.write(str(rank))
            # BUG FIX: the original wrote "\000" (a NUL byte) although its
            # comment said "add a space" — write an actual space instead.
            f.write(" ")
            f.write(str(title))
            f.write(" ")
            f.write("\n")
def download(pages):
    """Scrape *pages* pages (25 titles each) of the Douban Top-250 list
    and save the numbered titles to a text file.

    pages -- number of result pages to fetch (user-configurable).
    """
    # NOTE(review): hard-coded absolute Windows path — this only works on
    # the author's machine; consider making it a parameter.
    os.chdir(r"E:\python汇总\pythonsource\爬取存放")
    # Create the output directory; ignore it if it already exists.
    try:
        os.mkdir("豆瓣top250")
    except FileExistsError:
        pass
    os.chdir("豆瓣top250")
    # Kept as a global because the original exposes the result module-wide.
    global img_addrs
    titles = []
    for page in range(pages):
        # Douban pages are offset in steps of 25 via the `start` parameter.
        page_url = ("https://movie.douban.com/top250?start="
                    + str(page * 25) + "&filter=")
        titles.extend(find_imgs(page_url))
    # Number the titles starting at 1: {1: title1, 2: title2, ...}
    img_addrs = dict(enumerate(titles, start=1))
    save(img_addrs)
if __name__ == '__main__':
    # True only when run as a script; False when imported as a module.
    # NOTE(review): `url` is unused — download() builds its own page URLs.
    url = "https://movie.douban.com/top250?start=0&filter="
    download(10)
# 获取豆瓣电影排名 (scrape Douban movie rankings)
# 最新推荐文章于 2024-05-08 15:12:03 发布 — blog-page residue from the paste, not code