开始想着用IronPython库在C#里面直接执行python 方法 发现导包很多时候喜欢报错。到时候我用python 做一个web服务 直接调用接口
开始爬取博客园数据
爬博客园很简单 都是静态数据
思路。
1. 爬取随笔分类,获取URL
2. 逐个爬取分类,获取文章列表URL
3.爬取文章详情,下载图片
4. 替换文章详情图片链接
上代码
import requests
import os
from pyquery import PyQuery as pq
def Request(url, data=""):
    """Fetch *url* via HTTP GET and return the response body.

    Args:
        url: Target URL.
        data: Optional query-string parameters forwarded to requests.

    Returns:
        The response text on HTTP 200, otherwise 0 (callers test ``!= 0``).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
    # A timeout keeps the crawler from hanging forever on a dead host.
    req = requests.get(url, headers=headers, params=data, timeout=30)
    if req.status_code == 200:
        return req.text
    return 0
def disfeilei(html):
    """Parse the category list out of the blog sidebar HTML.

    Args:
        html: HTML of the sidecolumn page.

    Returns:
        A list of dicts with keys ``url``, ``name`` and ``count``, one
        per link found under ``#sidebar_postcategory``.
    """
    doc = pq(html)
    categories = []  # renamed: do not shadow the builtin ``list``
    for link in doc('#sidebar_postcategory').find('li a').items():
        text = link.text()
        # Link text looks like "Name(12)". Split on the LAST '(' so a
        # category name that itself contains '(' still parses correctly
        # (the original split('(')[0] truncated such names).
        name, _, tail = text.rpartition('(')
        categories.append({
            'url': link.attr('href'),
            'name': name,
            'count': tail[0:-1],  # drop the trailing ')'
        })
    return categories
def Getwzcon(url, img_base='h:/。net学习/blogs/BLOGS/WebApplication1/images/blogs/'):
    """Fetch an article page and rewrite its image links to local paths.

    Args:
        url: Article detail-page URL.
        img_base: Local directory prefix substituted into every ``<img>``
            ``src``; the default keeps the previously hard-coded path, so
            existing callers are unaffected.

    Returns:
        The article body HTML (``#cnblogs_post_body``) with image ``src``
        attributes rewritten.
    """
    html = Request(url)
    doc = pq(html)
    body = doc('#main').find('#cnblogs_post_body')
    for img in body.find("img").items():
        src = img.attr('src')
        # '797834' appears to be the blog's numeric id inside the cnblogs
        # image URL; the file name is everything after "797834/".
        marker = src.find('797834')
        if marker == -1:
            # Not a cnblogs-hosted image: leave the link untouched instead
            # of slicing from a bogus index (find() returned -1).
            continue
        local_path = img_base + src[marker + 7:]
        # 保存到项目文件 — download intentionally left disabled, as before:
        # dowimg(src, local_path)
        # 替换图片路径
        img.attr('src', local_path)
    return body.html()
def dowimg(url, path):
    """Download the image at *url* and write it to local file *path*.

    Creates the destination directory if it does not exist yet.

    Args:
        url: Image URL to download.
        path: Local file path receiving the binary content.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    folder = os.path.dirname(path)
    print(folder)
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(folder, exist_ok=True)
    resp = requests.get(url, timeout=30)
    # Fail loudly instead of silently saving an HTML error page as an image.
    resp.raise_for_status()
    with open(path, 'wb') as f:
        f.write(resp.content)
    print("文件下载成功")
def Getwenz(classfly):
    """Crawl one category listing page and collect its articles.

    Args:
        classfly: URL of the category listing page.

    Returns:
        A list of article dicts (title/url/desc/datatime/readcount/body).
        The original built this list but never appended to or returned
        it; it is now populated and returned (callers that ignored the
        former ``None`` return are unaffected).
    """
    html = Request(classfly)
    doc = pq(html)
    articles = []  # renamed: do not shadow the builtin ``list``
    for item in doc('#main').find('.entrylist>.entrylistItem').items():
        title_el = item.find('.entrylistItemTitle')  # hoisted: queried once
        url = title_el.attr('href')
        meta = item.find('.entrylistItemPostDesc').text().split(" ")
        art = {
            'title': title_el.text(),
            'url': url,
            # [0:-4] strips a fixed 4-char suffix — presumably the
            # "阅读全文" link text; TODO confirm against the page markup.
            'desc': item.find('.c_b_p_desc').text()[0:-4],
            'datatime': meta[2] + " " + meta[3],
            'readcount': meta[5][3:-1],
            # 获取详情内容 — fetch the full article body
            'body': Getwzcon(url),
        }
        print(art)
        articles.append(art)
    return articles
if __name__ == '__main__':
    # Sidebar endpoint that lists every post category for this blog.
    sidebar_url = "https://www.cnblogs.com/ruogu/mvc/blog/sidecolumn.aspx"
    params = {'blogApp': 'ruogu'}
    sidebar_html = Request(sidebar_url, params)
    if sidebar_html != 0:
        # Walk every category found in the sidebar and crawl its articles.
        # (Database insertion was never implemented here.)
        for category in disfeilei(sidebar_html):
            Getwenz(category['url'])