直接贴代码
#!/usr/local/bin/python3.5
# -*- coding:UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
import csv
# Seed the link-picking RNG from the wall clock.  A datetime object is not an
# accepted seed type on current Python versions (random.seed() raises
# TypeError for it since 3.11), so convert it to a float timestamp first.
random.seed(datetime.datetime.now().timestamp())

# Output file used through `writer`; newline='' leaves line endings to the
# csv module.  It is closed by the `finally` clause at the end of the script.
csvfile = open('test.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(csvfile)
def store(title, content):
    """Write one two-column row to the CSV output.

    Args:
        title: Value for the first column.
        content: Value for the second column.
    """
    writer.writerow([title, content])
def get_links(acticle_url):
    """Fetch one Wikipedia page, record its summary, and return its wiki links.

    The text of the page's first ``<h1>`` and of the first ``<p>`` inside
    ``div#mw-content-text`` are passed to ``store``; either value falls back
    to ``''`` when the element is absent.

    Args:
        acticle_url: Path appended to ``http://en.wikipedia.org``, for
            example ``'/wiki/Python'``.  ``''`` requests the site root.

    Returns:
        The ``<a>`` tags inside ``div#bodyContent`` whose ``href`` starts
        with ``/wiki/``, or an empty list when the page has no such
        container.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Release the HTTP connection as soon as the document has been parsed
    # instead of leaving it open until garbage collection.
    with urlopen('http://en.wikipedia.org' + acticle_url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # The link pattern below also admits pages that may lack a heading or a
    # leading paragraph, so treat every lookup as optional rather than
    # letting a None result abort the whole crawl.
    heading = soup.h1
    title = heading.get_text() if heading is not None else ''

    content = ''
    content_div = soup.find('div', {'id': 'mw-content-text'})
    if content_div is not None:
        first_paragraph = content_div.find('p')
        if first_paragraph is not None:
            content = first_paragraph.get_text()

    store(title, content)

    body = soup.find('div', {'id': 'bodyContent'})
    if body is None:
        return []
    return body.find_all('a', href=re.compile(r'^(/wiki/)(.)*$'))
# Random walk over Wikipedia: start at the site root, then keep following one
# randomly chosen /wiki/ link per page until a page yields no links.  The
# first fetch sits inside the try block as well, so the CSV file is closed
# even when that initial request fails.
try:
    links = get_links('')
    while links:
        newActicle = random.choice(links).attrs['href']
        links = get_links(newActicle)
        # Show the candidate links found on the page just visited.
        print(links)
finally:
    csvfile.close()