import urllib.request,re
def download_html(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The request carries a desktop-browser ``User-Agent`` header instead of
    urllib's default one.

    Args:
        url: Address to fetch; any URL that ``urllib.request.urlopen`` accepts.

    Returns:
        The response body as a ``str``.

    Raises:
        ValueError: If ``url`` has no recognisable scheme (e.g. it is empty).
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        # Adjacent string literals are concatenated with nothing in between,
        # so every separating space has to be written inside the literals.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/76.0.3809.87 Safari/537.36 SLBrowser/6.0.1.8131"
        ),
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
# Movie introduction page URL. The dots are escaped so that they match only a
# literal "." in the host name rather than any character.
_SUBJECT_URL_RE = re.compile(r"https://movie\.douban\.com/subject/[0-9]+/")


def extract_url(html):
    """Return the distinct movie introduction page URLs found in ``html``.

    Args:
        html: Page source to scan.

    Returns:
        A ``set`` of every substring of the form
        ``https://movie.douban.com/subject/<digits>/``; empty if there is none.
        Building a set removes URLs that are repeated within the page.
    """
    return set(_SUBJECT_URL_RE.findall(html))
# Driver: read one page URL per line from douban.txt, download each page, and
# write every movie introduction URL found on it to movie.txt (one per line).
# The `with` statement closes both files even if a download raises midway.
with open('H://WorkPlace//PyWorkPlace//test1//douban.txt', 'r') as source, \
        open('H://WorkPlace//PyWorkPlace//test1//movie.txt', 'w') as output:
    for line in source:
        # Strip leading/trailing whitespace, including the trailing newline.
        page_url = line.strip()
        if not page_url:
            # A blank line would be passed on as "" and rejected by urllib.
            continue
        print(page_url)
        html = download_html(page_url)
        # A separate name keeps the page URL from being overwritten by the
        # extracted movie URLs.
        output.writelines(movie_url + '\n' for movie_url in extract_url(html))
# Test output demo:


# [web-page residue] ... collapsed comments
# [web-page residue] Why were they collapsed?



