Scraping jokes with regular expressions: a tip from a friend

import requests
from bs4 import BeautifulSoup
import os
import shutil

start = os.getcwd()


def getHTMLtext(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'gbk'  # the ZOL joke pages are served as GBK
        return r.text
    except requests.RequestException:  # network error or bad status code
        return ""


def list2txt(title_list, content_list, i):
    # characters that are illegal or awkward in file names
    untitle = '?!@#$%^&*()+<>,.:";'

    os.chdir(start)
    dn = str(i)  # one directory per page
    if os.path.exists(dn):
        shutil.rmtree(dn)
    os.mkdir(dn)
    os.chdir(dn)

    for n, title in enumerate(title_list):
        for t in untitle:
            title = title.replace(t, "")
        fn = title + '.txt'
        print(fn)

        # filter out bare newline nodes; removing items while iterating
        # over the same list skips elements, so build a new list instead
        content_list[n] = [temp for temp in content_list[n] if temp != '\n']
        joke = (str(content_list[n]).replace("<p>", "").replace("</p>", "")
                .replace('[', "").replace(']', "").replace(u'\xa0', u''))

        with open(fn, "w", encoding="utf-8") as f:
            f.write(joke)  # embedded newlines may still remain in the text
            f.write('\n\n')
    os.chdir(start)


def get_list(soup, i):
    content_list = []
    title_list = []
    data_all = soup.find('ul', {'class': "article-list"})
    if data_all is None:  # download failed or the page layout changed
        return
    data = data_all.find_all('li', {'class': "article-summary"})
    for b in data:
        # <span class="article-title"><a target="_blank" href="/detail60/59045.html">闲侃男女,笑语连珠</a></span>
        data_title = b.find('span', {'class': "article-title"})
        if data_title.string in title_list:  # skip duplicate entries
            continue
        title_list.append(data_title.string)
        data_content = b.find('div', {'class': "summary-text"})
        content_list.append(data_content.contents)
    list2txt(title_list, content_list, i)


def main():
    # page URLs look like https://xiaohua.zol.com.cn/new/2.html
    base = "https://xiaohua.zol.com.cn/new/"
    n = 5  # number of pages to crawl
    # n = int(input('How many pages to crawl?\n'))  # int(), not eval(), on user input
    for i in range(1, n + 1):
        url = base + str(i) + '.html'
        html = getHTMLtext(url)
        # parse the page, then write each joke's title and text to a txt file
        soup = BeautifulSoup(html, "html.parser")
        # get_list creates one folder per page to hold the txt files
        get_list(soup, i)


if __name__ == '__main__':
    main()
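
The title promises regular expressions, but the script above does all of its extraction with BeautifulSoup. As a minimal sketch, the same title and summary scraping could be done with the re module instead; the patterns below are assumptions derived from the sample <span class="article-title"> markup quoted in the comment inside get_list, not verified against the live page.

import re

def get_list_regex(html):
    # assumed patterns, modeled on the sample markup shown above
    title_pat = re.compile(r'<span class="article-title">.*?>([^<]+)</a></span>', re.S)
    summary_pat = re.compile(r'<div class="summary-text">(.*?)</div>', re.S)
    titles = title_pat.findall(html)
    summaries = [re.sub(r'</?p>|\xa0', '', s).strip() for s in summary_pat.findall(html)]
    return list(zip(titles, summaries))

Pairing get_list_regex with getHTMLtext(url) would yield the same (title, text) pairs without building a parse tree, at the cost of breaking more easily when the markup changes.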
