from bs4 import BeautifulSoup
import urllib
import urllib.request
def IsValidTitle(title):
    """Return True unless *title* contains a blocked phrase.

    Matching is a case-insensitive substring test against every entry of
    the block list below.

    Args:
        title: The article title to check (a ``str``).

    Returns:
        ``False`` if any blocked phrase occurs in *title*, else ``True``.
    """
    blockList = (
        '帝都',
        'windows anaconda TypeError: LoadLibrary()',
        '[Debug]',
        '【Debug】',
        'iOS',
    )
    # Lower-case the title once instead of once per block-list entry.
    loweredTitle = title.lower()
    return not any(item.lower() in loweredTitle for item in blockList)
def FetchCSDNTitles(sourceUrl, timeout=30):
    """Download a page and return Markdown links for its ``<h4>`` headings.

    Every ``<h4>`` element that contains at least one ``<a>`` is treated as
    an entry: the heading text becomes the link title and the first
    anchor's ``href`` becomes the link target. Entries whose title is
    rejected by ``IsValidTitle`` are dropped. The resulting list is also
    printed to stdout before being returned.

    Args:
        sourceUrl: URL of the page to download.
        timeout: Seconds to wait on the network before ``urlopen`` gives
            up, so a stalled server cannot hang the crawl indefinitely.

    Returns:
        A list of strings formatted as ``[title](href)``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    with urllib.request.urlopen(sourceUrl, timeout=timeout) as httpResponse:
        htmlSource = httpResponse.read()

    # Parse after the response is closed; only the raw bytes are needed.
    soup = BeautifulSoup(htmlSource, "html.parser")
    res = []
    for tag in soup.find_all("h4"):
        anchors = tag.find_all("a")
        if not anchors:
            continue
        # An <a> without an href cannot produce a usable link; skip it
        # instead of raising KeyError.
        href = anchors[0].get('href')
        if href is None:
            continue
        # strip() removes any of these *characters* from both ends:
        # whitespace plus 原/创/转/载 (presumably the "原创"/"转载" labels
        # shown next to the title -- TODO confirm against the page markup).
        title = tag.get_text().strip('\t\n\r原创转载 ')
        if IsValidTitle(title):
            res.append('[{0}]({1})'.format(title, href))
    print(res)
    return res
# Crawl article-list pages 1..23 of the blog and write every collected
# Markdown link to the output file, one per line.
outputPath = r'E:\Temp\taoqick.txt'
pageUrlTemplate = 'https://blog.csdn.net/taoqick/article/list/{0}'
with open(outputPath, 'w', encoding='utf-8') as outputFile:
    for i in range(1, 24):
        # Build the URL once so the progress message and the request
        # can never disagree.
        pageUrl = pageUrlTemplate.format(i)
        print('-----------------------------------------\nCrawling ' + pageUrl)
        titles = FetchCSDNTitles(pageUrl)
        outputFile.writelines(title + '\n' for title in titles)