# A browser-like User-Agent header is added to avoid HTTP 403 responses.
# The ViewMode=contents cookie reduces page transfer by skipping article
# content summaries; it can be removed if full pages are acceptable.
# coding=utf-8
import urllib.request
from urllib.parse import quote
from bs4 import BeautifulSoup
def _parse_articles(html):
    """Parse a CSDN article-list HTML page into a list of article dicts.

    Args:
        html: Decoded HTML text of an article-list page.

    Returns:
        A list of dicts with keys ``title``, ``url``, ``date``, ``view``,
        and ``comments`` (all strings; empty string when a field is missing).
    """
    soup = BeautifulSoup(html, "html.parser")
    articles = []
    for item in soup.select('div[class="list_item list_view"]'):
        # select_one returns None instead of raising IndexError when a span
        # is missing, so one malformed item cannot crash the whole run.
        title = item.select_one('span[class="link_title"]')
        postdate = item.select_one('span[class="link_postdate"]')
        view = item.select_one('span[class="link_view"]')
        comments = item.select_one('span[class="link_comments"]')
        articles.append({
            "title": title.get_text().strip() if title else "",
            "url": item.a["href"] if item.a else "",
            "date": postdate.get_text().strip() if postdate else "",
            # Strip the CSDN "阅读(N)" / "评论(N)" wrappers down to the number.
            "view": (view.get_text().strip()
                     .replace("阅读(", "").replace(")", "") if view else ""),
            "comments": (comments.get_text().strip()
                         .replace("评论(", "").replace(")", "") if comments else ""),
        })
    return articles


def get_article_all(url="http://blog.csdn.net/bwlab/article/list/100000"):
    """Fetch a CSDN article-list page and print each article's metadata.

    Args:
        url: Article-list page to scrape. Defaults to the page the script
            was originally written for, so existing callers are unaffected.
    """
    headers = {
        # Browser-like User-Agent avoids the server's 403 response.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        # ViewMode=contents asks for a lighter page without article summaries.
        'Cookie': 'ViewMode=contents;',
    }
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the HTTP response even if decoding fails
    # (the original leaked the connection).
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    for info in _parse_articles(html):
        print("title:", info["title"])
        print("url:", info["url"])
        print("date:", info["date"])
        print("view:", info["view"])
        print("comments:", info["comments"])
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ =="__main__":
    get_article_all()