Python数据挖掘入门与实践-第9章-古腾堡计划网站书籍资料下载

由于python版本以及网站更新等原因,导致了书上的代码没有用了。因此自己试着修改了代码。

下面就来讲讲修改中遇到的主要问题

问题:网站URL变更

# 书上的代码
url_base = "http://www.gutenberg.myebook.bg/"

fixes[1044] = url_base + "1/0/4/1044/1044-0.txt"
fixes[5148] = url_base + "5/1/4/5148/5148-0.txt"
fixes[4657]="https://archive.org/stream/personalnarrativ04657gut/pnpa110.txt"

由于网站更新,现在应该使用以下URL

url_base = 'http://www.gutenberg.org/files/'
url_format = '{url_base}{id}/{id}-0.txt'

# 修复URL
url_fix_format = 'http://www.gutenberg.org/cache/epub/{id}/pg{id}.txt'

并且这里需要保存修复的URL,使用下面代码

from collections import defaultdict
fiexes = defaultdict(list)
# 这样就可以直接保存每个作者需要重新请求作品的URL
# 比如
fiexes['twain'].append(bookid)

详细代码如下(亲测有效):

import requests
import os
import time
from collections import defaultdict

# Project Gutenberg book IDs for each of the seven authors used in the
# case study (keys are author surnames, values are lists of numeric IDs).
titles = {
    'burton': [4657, 2400, 5760, 6036, 7111, 8821,
               18506, 4658, 5761, 6886, 7113],
    'dickens': [24022, 1392, 1414, 1467, 2324, 580,
                786, 888, 963, 27924, 1394, 1415, 15618,
                25985, 588, 807, 914, 967, 30127, 1400,
                1421, 16023, 28198, 644, 809, 917, 968, 1023,
                1406, 1422, 17879, 30368, 675, 810, 924, 98,
                1289, 1413, 1423, 17880, 32241, 699, 821, 927],
    'doyle': [2349, 11656, 1644, 22357, 2347, 290, 34627, 5148,
              8394, 26153, 12555, 1661, 23059, 2348, 294, 355,
              5260, 8727, 10446, 126, 17398, 2343, 2350, 3070,
              356, 5317, 903, 10581, 13152, 2038, 2344, 244, 32536,
              423, 537, 108, 139, 2097, 2345, 24951, 32777, 4295,
              7964, 11413, 1638, 21768, 2346, 2845, 3289, 439, 834],
    'gaboriau': [1748, 1651, 2736, 3336, 4604, 4002, 2451,
                 305, 3802, 547],
    'nesbit': [34219, 23661, 28804, 4378, 778, 20404, 28725,
               33028, 4513, 794],
    'tarkington': [1098, 15855, 1983, 297, 402, 5798,
                   8740, 980, 1158, 1611, 2326, 30092,
                   483, 5949, 8867, 13275, 18259, 2595,
                   3428, 5756, 6401, 9659],
    'twain': [1044, 1213, 245, 30092, 3176, 3179, 3183, 3189, 74,
              86, 1086, 142, 2572, 3173, 3177, 3180, 3186, 3192,
              76, 91, 119, 1837, 2895, 3174, 3178, 3181, 3187, 3432,
              8525],
}

# Sanity check: seven authors, each with the expected number of titles.
assert len(titles) == 7
_expected_counts = {'tarkington': 22, 'dickens': 44, 'nesbit': 10,
                    'doyle': 51, 'twain': 29, 'burton': 11, 'gaboriau': 10}
for _author, _count in _expected_counts.items():
    assert len(titles[_author]) == _count


# Primary download location (the site layout changed since the book was
# published, so these replace the URLs printed in the book).
url_base = 'http://www.gutenberg.org/files/'
url_format = '{url_base}{id}/{id}-0.txt'

# Fallback location tried for books whose primary URL fails.
url_fix_format = 'http://www.gutenberg.org/cache/epub/{id}/pg{id}.txt'

# Book IDs whose first download attempt failed, grouped by author,
# e.g. fiexes['twain'].append(1044).
# NOTE(review): the name is a typo for "fixes", but it is kept because the
# download loops below reference it by this spelling.
fiexes = defaultdict(list)

# Store the data under ./Data/books relative to the current directory.
# The original used the literal 'Data\\books', which on non-Windows systems
# creates a single directory literally named 'Data\books'; joining the two
# components separately is portable, and makedirs creates both levels.
data_folder = os.path.join(os.path.abspath('.'), 'Data', 'books')
os.makedirs(data_folder, exist_ok=True)
print(data_folder)

# First pass: try the primary '/files/' URL for every title.  Books whose
# download fails are queued in `fiexes` for a second attempt against the
# '/cache/epub/' mirror.
for author in titles:
    print('Downloading titles from', author)
    # Make the author's folder if it does not exist yet.
    author_folder = os.path.join(data_folder, author)
    if not os.path.exists(author_folder):
        os.makedirs(author_folder)
    # Download each title into that folder, skipping files already on disk.
    for bookid in titles[author]:
        url = url_format.format(url_base=url_base, id=bookid)
        print(' - ', url)
        filename = os.path.join(author_folder, '%s.txt' % bookid)
        if os.path.exists(filename):
            print(' - File already exists, skipping')
            continue
        r = requests.get(url)
        # Any non-success response (not just 404) means the primary URL is
        # wrong for this book; queue it for the fallback pass rather than
        # saving an error page as the book text.
        if r.status_code != 200:
            print('url', r.status_code, ':', author, bookid, 'add to fixes list')
            fiexes[author].append(bookid)
        else:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(r.text)
        # Be polite to the server between requests.
        time.sleep(1)
print('Download complete')


# Second pass: retry every queued failure against the '/cache/epub/' mirror.
print('开始下载修复列表')
for author in fiexes:
    print('开始下载<%s>的作品' % author)
    author_folder = os.path.join(data_folder, author)
    if not os.path.exists(author_folder):
        os.makedirs(author_folder)

    for bookid in fiexes[author]:
        filename = os.path.join(author_folder, '%s.txt' % bookid)
        if os.path.exists(filename):
            print('文件已经下载,跳过')
            continue
        url_fix = url_fix_format.format(id=bookid)
        print(' - ', url_fix)
        r = requests.get(url_fix)
        # Report any non-success status (not only 404) instead of silently
        # saving an error page as the book text.
        if r.status_code != 200:
            print('又出错了!', author, bookid)
        else:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(r.text)
        # Be polite to the server between requests.
        time.sleep(1)
print('修复列表下载完毕')
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值