Simple crawler

Two small scraping examples follow: the first pulls chart entries from the Douban Movies homepage, the second downloads a web novel from biquyun.com chapter by chapter.

from urllib import request
import chardet
import re

def get_url_list():
    # Fetch the Douban Movies homepage and return (href, link text) pairs from its nav list.
    response = request.urlopen("https://movie.douban.com/")
    html = response.read()
    charset = chardet.detect(html)  # e.g. {'language': '', 'encoding': 'utf-8', 'confidence': 0.99}
    html = html.decode(charset['encoding'])
    reg = r'<li    ><a href="(.*?)".*?>(.*?)</a>.*?</li>'
    result = re.findall(reg, html, re.S)
    return result

def get_text(url):
    # Fetch a chart page and return (detail URL, title) pairs for each entry.
    response = request.urlopen(url)
    html = response.read()
    charset = chardet.detect(html)
    html = html.decode(charset['encoding'])
    reg = r'<a class="nbg" href="(.*?)"  title="(.*?)">'
    result = re.findall(reg, html)
    # print(result)
    return result

def get_context(url):
    # Fetch a detail page and pull the "description" and "ratingValue" fields
    # out of the JSON embedded in the page.
    response = request.urlopen(url)
    html = response.read()
    charset = chardet.detect(html)
    html = html.decode(charset['encoding'])
    reg = r' "description": "(.*?)".*?"ratingValue": "(.*?)"'
    result = re.findall(reg, html, re.S)[0]
    # print(result)
    return result

# Walk the links on the Douban homepage; for the "排行榜" (rankings) link, visit the
# chart page and save "title <TAB> rating <TAB> description" for every entry.
for url_list, url_title in get_url_list():
    if url_title == '排行榜':
        for novel_url, novel_title in get_text(url_list):
            novel_context, score = get_context(novel_url)
            line = novel_title + '\t' + str(score) + '\t' + novel_context + '\n'
            with open("./豆瓣排行榜.txt", "a+", encoding='utf-8') as fn:
                fn.write(line)
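
The three functions above repeat the same fetch / chardet-detect / decode steps. As a minimal sketch (fetch_html is my own name, not part of the original script), that boilerplate could be factored into one helper:

from urllib import request
import chardet

def fetch_html(url):
    # Download a page and decode it with whatever charset chardet detects,
    # falling back to UTF-8 when detection returns nothing.
    raw = request.urlopen(url).read()
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding, errors='replace')

Each of get_url_list, get_text and get_context could then replace its first four lines with a single fetch_html(...) call.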

 

Result screenshot (image omitted).

The second script below uses the same urlopen-and-regex approach to download a full novel from biquyun.com, saving each chapter to its own file.

from urllib import request
import chardet
import re
import ssl
ssl._create_default_https_context = ssl._create_unverified_context  # skip HTTPS certificate verification

def get_url_list():
    # Fetch the novel's table of contents and return (chapter href, chapter title) pairs.
    response = request.urlopen("https://www.biquyun.com/0_452/")
    html = response.read()
    # charset = chardet.detect(html)  # detection not needed here: the site serves GBK pages
    html = html.decode('gbk')  # decode
    reg = r'<dd><a href="(.*?)">(.*?)</a></dd>'
    result = re.findall(reg, html, re.S)
    return result

def get_url_context(url):
    # Fetch one chapter page and return the raw HTML inside its content <div>.
    response = request.urlopen(url)
    result = response.read()
    result = result.decode('gbk')
    reg = r'<div id="content">(.*?)</div>'
    context = re.findall(reg, result, re.S)[0]
    return context

# Download every chapter listed in the table of contents and save each one to its own HTML file.
for novel_url, novel_title in get_url_list():
    novel_url = 'https://www.biquyun.com' + novel_url  # hrefs in the TOC are relative
    print("正在保存  %s" % novel_title)  # "saving <chapter title>"
    chapter = get_url_context(novel_url)
    # print(novel_url, novel_title)
    with open("%s.html" % novel_title, "w", encoding='utf-8') as fn:
        fn.write(chapter)
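
The chapters are saved as raw HTML fragments, so they still contain tags and entities such as <br /> and &nbsp;. As a rough sketch (clean_chapter is my own helper, and the exact markup the site emits is an assumption), the content could be flattened to plain text before writing it out:

import re
import html

def clean_chapter(raw_html):
    # Rough clean-up: turn <br> variants into newlines, unescape HTML
    # entities such as &nbsp;, then strip any remaining tags.
    text = re.sub(r'<br\s*/?>', '\n', raw_html)
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>', '', text)
    return text.strip()

The loop above could then write clean_chapter(chapter) to a .txt file instead of saving the raw HTML.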

 
