1. code
import requests
from lxml import html
from lxml import etree
from lxml.etree import ParserError
from time import sleep
import xlwt
import time
import re
#把取出的数据放到数据框中
from pandas import DataFrame
class MyBlog:
    """Scrape a CSDN user's blog: the category sidebar, each category's
    article listing, and an aggregate of all articles.

    A single requests.Session is kept so the User-Agent header is sent
    on every request.
    """

    def __init__(self, blogNm):
        # blogNm: the CSDN user name, e.g. "shitou987".
        headers = {"User-Agent": "Python-urllib/2.6"}
        self.session = requests.Session()
        self.session.headers.update(headers)
        self.blogNm = blogNm

    def get_categorys(self):
        """Fetch the profile page and return the category sidebar as
        [{"categoryName": <name>, "url": <category url>}, ...].

        Scrapes with regular expressions against the raw HTML.
        """
        res = self.session.get(f"https://blog.csdn.net/{self.blogNm}")
        # Regular-expression matching
        url_pattern = '<a class="clearfix" target="_blank" href="(.*?)"'
        category_urls = re.findall(url_pattern, res.text)
        print(category_urls)
        name_pattern = '<span class="text">(.*?)</span>'
        category_names = re.findall(name_pattern, res.text)
        print(category_names)
        # BUG FIX: the original always returned an empty category_list
        # (the loop that filled it was inside a commented-out section),
        # which made get_all_blogs() a no-op.  Pair each scraped name
        # with its URL here.
        # NOTE(review): assumes the two patterns match in lockstep on the
        # live page (one <span class="text"> per clearfix anchor) — confirm.
        category_list = [
            {"categoryName": name.strip(), "url": url}
            for name, url in zip(category_names, category_urls)
        ]
        return category_list

    def get_blog_with_category(self, category_url):
        """Fetch one category page and return
        {"count": <n>, "blogs": [{"blogName": ..., "url": ...}, ...]}.

        Raises AssertionError on a non-200 response (exception type kept
        for backward compatibility with the original `assert`), lxml's
        ParserError on unparseable HTML, and IndexError when the scraped
        names and URLs do not line up.
        """
        res = self.session.get(category_url)
        if res.status_code != 200:
            # Raise explicitly: a bare `assert` is stripped under `python -O`.
            raise AssertionError("get category detail failed.")
        dom = html.document_fromstring(res.text)  # ParserError propagates to the caller
        blog_urls = dom.xpath('//*[@class="column_article_list"]//a/@href')
        blog_names = dom.xpath('//*[@class="column_article_list"]//*[@class="column_article_title"]//h2/text()')
        # Drop whitespace-only title text nodes before pairing.
        blog_names_update = [name.strip() for name in blog_names if name.strip()]
        blog_list = []
        try:
            for index, url in enumerate(blog_urls):
                blog_list.append({"blogName": blog_names_update[index], "url": url})
        except IndexError:
            raise IndexError("Name and URL are inconsistent.")
        return {"count": len(blog_list), "blogs": blog_list}

    def get_all_blogs(self):
        """Walk every category and return
        {"blogs": [{"categoryName": ..., "elements": <per-category dict>}, ...],
         "count": <total number of articles across all categories>}.
        """
        blogs = []
        count = 0
        for category in self.get_categorys():
            blog_dict = self.get_blog_with_category(category["url"])
            blogs.append({
                "categoryName": category["categoryName"],
                "elements": blog_dict,
            })
            count += blog_dict["count"]
            sleep(1)  # be polite to the server between category requests
        return {"blogs": blogs, "count": count}
def set_style(name, height, bold=False):
    """Build an xlwt cell style for header cells.

    Args:
        name: font name, e.g. 'Times New Roman'.
        height: xlwt font height (twips), e.g. 220 for 11pt.
        bold: whether the font is bold.

    Returns:
        An xlwt.XFStyle whose font uses palette colour index 4 (blue).
    """
    style = xlwt.XFStyle()  # initialize the style
    font = xlwt.Font()      # create a font for the style
    font.name = name
    font.bold = bold
    # BUG FIX: xlwt uses the British spelling `colour_index`; the original
    # `font.color_index = 4` set an unused attribute, so the colour was
    # silently never applied.
    font.colour_index = 4
    font.height = height
    style.font = font
    return style
def write_excel(blogs):
    """Write the nested blogs dict (as returned by MyBlog.get_all_blogs)
    into a timestamped .xls workbook, one row per article.
    """
    file_name = "blog_" + time.strftime("%Y%m%d%H%M%S") + ".xls"
    # Create the workbook and its single sheet.
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(u'blog', cell_overwrite_ok=True)
    # Header row, with a styled font.
    header = [u'分类', u'博客', u'url']
    header_style = set_style('Times New Roman', 220, True)
    for col, title in enumerate(header):
        sheet.write(0, col, title, header_style)
    # Body rows: category name, article title, article URL.
    row = 1
    for category in blogs["blogs"]:
        category_name = category["categoryName"]
        elements = category["elements"]
        for i in range(elements["count"]):
            entry = elements["blogs"][i]
            cells = [category_name, entry["blogName"], entry["url"]]
            for col, value in enumerate(cells):
                sheet.write(row, col, value)
            row += 1
    # Persist the workbook to disk.
    workbook.save(file_name)
if __name__ == "__main__":
    # Fetch and print the category list for this CSDN user.
    MyBlog("shitou987").get_categorys()
    # blogs_dict = my.get_all_blogs()
    # print(blogs_dict)
    # write_excel(blogs_dict)
    # Crawling multiple pages:
    # 1. Find the pagination pattern and build the URL (the page number
    #    is usually part of the URL).
    # 2. Loop over the pages to fetch each one.
2. 数据结构
{
'blogs': [{
'categoryName': 'python',
'elements': {
'count': 39,
'blogs': [{
'blogName': 'Python多线程&进程&协程',
'url': 'https://blog.csdn.net/shitou987/article/details/108460536'
}, {
'blogName': 'python jsonschema使用',
'url': 'https://blog.csdn.net/shitou987/article/details/107582352'
}, {
'blogName': '列表排序',
'url': 'https://blog.csdn.net/shitou987/article/details/107548825'
}, {
'blogName': '算法——递归',
...
],
'count': 88
}
3. 写入excel
4. DataFrame
保存为csv文件