from urllib import request
import chardet
import re
def get_url_list(url="https://movie.douban.com/"):
    """Download a page and return the links found inside its ``<li >`` items.

    Args:
        url: Address of the page to fetch. Defaults to the Douban Movie
            home page, which is what the no-argument call has always used.

    Returns:
        A list of ``(href, link_text)`` tuples in document order, one for
        every ``<li ><a href="...">...</a>...</li>`` fragment in the page.
        The list is empty when nothing matches. Note that the pattern only
        recognises ``<li >`` written with a space before the ``>``.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager releases the connection even when read() fails.
    with request.urlopen(url) as response:
        html = response.read().decode('utf-8')

    # re.S lets "." cross line breaks, because a single <li> item may be
    # spread over several lines of markup.
    link_pattern = r'<li ><a href="(.*?)".*?>(.*?)</a>.*?</li>'
    return re.findall(link_pattern, html, re.S)
def get_text(url):
    """Download a listing page and return the entries linked with class ``nbg``.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of ``(href, title)`` tuples in document order, one for every
        ``<a class="nbg" href="..." title="...">`` tag in the page. The list
        is empty when the page contains no such tag.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager releases the connection even when read() fails.
    with request.urlopen(url) as response:
        html = response.read().decode('utf-8')

    # No re.S here: the whole opening tag must sit on a single line to match.
    entry_pattern = r'<a class="nbg" href="(.*?)" title="(.*?)">'
    return re.findall(entry_pattern, html)
def get_context(url):
    """Download a detail page and return its description and rating value.

    The page is searched for a ``"description": "..."`` entry followed,
    anywhere later in the document, by a ``"ratingValue": "..."`` entry.

    Args:
        url: Address of the detail page to fetch.

    Returns:
        A ``(description, rating_value)`` tuple of strings taken from the
        first such pair in the page.

    Raises:
        IndexError: If the page contains no description/ratingValue pair.
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # The context manager releases the connection even when read() fails.
    with request.urlopen(url) as response:
        html = response.read().decode('utf-8')

    # re.S lets ".*?" bridge the lines between the two entries.
    detail_pattern = r' "description": "(.*?)".*?"ratingValue": "(.*?)"'
    # Only the first occurrence is needed, so stop there instead of
    # collecting every match in the document.
    match = re.search(detail_pattern, html, re.S)
    if match is None:
        # Still IndexError, as before, but now saying which page failed.
        raise IndexError(
            "no description/ratingValue pair found at %s" % url
        )
    return match.groups()
# Driver: find the home-page link whose text is '排行榜', visit every entry
# listed on that page, and append one tab-separated line per entry
# (title, score, description) to ./豆瓣排行榜.txt.
for url_list, url_title in get_url_list():
    if url_title != '排行榜':
        continue
    for novel_url, novel_title in get_text(url_list):
        novel_context, score = get_context(novel_url)
        novel_context = '\t'.join(
            (str(novel_title), str(score), novel_context)
        ) + '\n'
        # Append after each entry so finished rows survive a later failure;
        # "with" guarantees the handle is closed even if the write raises.
        with open("./豆瓣排行榜.txt", "a+", encoding='utf-8') as fn:
            fn.write(novel_context)
# Result screenshot (效果图)
from urllib import request
import chardet
import re
import ssl
# Swap the ssl module's default HTTPS context factory for the one that builds
# an unverified context, so HTTPS requests made through urllib in this process
# skip certificate verification.
# NOTE(review): this is process-wide and removes protection against
# man-in-the-middle attacks; presumably a workaround for a certificate problem
# on the scraped site - confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context
def get_url_list(url="https://www.biquyun.com/0_452/"):
    """Download a table-of-contents page and return its chapter links.

    Args:
        url: Address of the page to fetch. Defaults to the index page that
            the no-argument call has always used.

    Returns:
        A list of ``(href, title)`` tuples in document order, one for every
        ``<dd><a href="...">...</a></dd>`` fragment in the page. The list is
        empty when nothing matches.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    # The context manager releases the connection even when read() fails.
    with request.urlopen(url) as response:
        # The body is decoded as GBK rather than UTF-8.
        html = response.read().decode('gbk')

    chapter_pattern = r'<dd><a href="(.*?)">(.*?)</a></dd>'
    return re.findall(chapter_pattern, html, re.S)
def get_url_context(url):
    """Download a chapter page and return the inner HTML of its content div.

    Args:
        url: Address of the chapter page to fetch.

    Returns:
        The text between ``<div id="content">`` and the first ``</div>``
        that follows it, as a string, with any inner markup left intact.

    Raises:
        IndexError: If the page has no ``<div id="content">`` block.
        urllib.error.URLError: If the page cannot be retrieved.
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    # The context manager releases the connection even when read() fails.
    # (The previous "response.encondeing = 'gbk'" assignment set a misspelled
    # attribute that nothing reads; the explicit decode below does the work.)
    with request.urlopen(url) as response:
        page = response.read().decode('gbk')

    # re.S lets the capture span the many lines of a chapter body.
    match = re.search(r'<div id="content">(.*?)</div>', page, re.S)
    if match is None:
        # Still IndexError, as before, but now saying which page failed.
        raise IndexError('no <div id="content"> block found at %s' % url)
    return match.group(1)
# Driver: download every chapter listed on the index page and save each one
# as "<chapter title>.html" in the current directory.
for novel_url, novel_title in get_url_list():
    # The scraped hrefs are prefixed with the site origin before fetching.
    novel_url = 'https://www.biquyun.com' + novel_url
    print("正在保存 %s" % novel_title)
    chapter = get_url_context(novel_url)
    # The title comes from a remote page, so neutralise path separators:
    # otherwise a title containing one could fail to open or write outside
    # the current directory.
    safe_title = novel_title.replace('/', '_').replace('\\', '_')
    # "with" guarantees the handle is closed even if the write raises.
    with open("%s.html" % safe_title, "w", encoding='utf-8') as fn:
        fn.write(chapter)