0x01.描述
- CSDN自带数据分析功能,但功能毕竟有限。我们可以根据自己的需求定制数据分析,而第一步就是获取数据。可获取的数据有很多种,这里以左边小栏目的数据为例,使用BeautifulSoup进行抓取。
0x02.Py思路
- 这里的思路是获取这两个div下的所有dl的title值,也就是我们需要的数据。
0x03.Py脚本
- 可以根据需要每隔多久爬取一次。
- 同时写入了此次爬取的时间。
import time
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the profile page once. The context manager releases the HTTP
# connection even if reading or decoding fails.
with urlopen(r"https://blog.csdn.net/ATFWUS") as response:
    html = response.read().decode('utf-8')
soup = BeautifulSoup(html, features="html.parser")

# One pass per run: to crawl periodically, schedule this script externally or
# wrap the section below in a loop with time.sleep().
# The context manager guarantees the file is flushed and closed even when one
# of the lookups below raises (e.g. an element is missing from the page).
with open("D://DeskTop/csdn.txt", 'w+', encoding='utf-8') as f:
    # First line of the output: the time of this crawl.
    ltime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    f.write(ltime + "\n")
    print(ltime)

    # Text of the "name" and "personal-home-page" spans.
    span1 = soup.find("span", attrs={"class": "name"}).get_text()
    span2 = soup.find("span", attrs={"class": "personal-home-page"}).get_text()
    print(span1.strip())
    print(span2.strip())
    f.write(span1.strip() + "\n")
    f.write(span2.strip() + "\n")

    # Page <title> text.
    title = soup.title.string
    f.write(title.strip() + "\n")

    # Each matching div contributes the title attribute of its first five <dl>
    # children, one value per output line.
    tags = soup.find_all("div", attrs={"class": "data-info d-flex item-tiling"})
    for tag in tags:
        # Run the selector once per container instead of once per index.
        entries = tag.select('dl')
        for i in range(5):
            data = entries[i].get('title')
            print(data)
            f.write(data + "\n")
0x04.Java对数据进行处理
实体类:
import java.util.Date;
/**
 * Plain data holder for one snapshot of profile statistics.
 *
 * <p>Every property is a simple read/write bean property; no validation or
 * copying is performed by the accessors.
 */
public class CSDN_datas {

    /** Time at which the snapshot was taken. */
    private Date date;
    /** Display name. */
    private String name;
    /** Code age, stored as text. */
    private String codeAge;
    /** Introduction text. */
    private String introduction;
    private Integer yc;
    private Integer fans;
    private Integer zan;
    private Integer content;
    private Integer cust;
    private Integer jf;
    private Integer sc;
    private Integer weekGrade;
    private Integer totalGrade;
    private Integer grade;

    /** @return the snapshot time, or {@code null} if unset */
    public Date getDate() {
        return this.date;
    }

    /** @param date the snapshot time */
    public void setDate(Date date) {
        this.date = date;
    }

    /** @return the display name, or {@code null} if unset */
    public String getName() {
        return this.name;
    }

    /** @param name the display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the code age text, or {@code null} if unset */
    public String getCodeAge() {
        return this.codeAge;
    }

    /** @param codeAge the code age text */
    public void setCodeAge(String codeAge) {
        this.codeAge = codeAge;
    }

    /** @return the introduction text, or {@code null} if unset */
    public String getIntroduction() {
        return this.introduction;
    }

    /** @param introduction the introduction text */
    public void setIntroduction(String introduction) {
        this.introduction = introduction;
    }

    /** @return the {@code yc} value, or {@code null} if unset */
    public Integer getYc() {
        return this.yc;
    }

    /** @param yc the {@code yc} value */
    public void setYc(Integer yc) {
        this.yc = yc;
    }

    /** @return the {@code fans} value, or {@code null} if unset */
    public Integer getFans() {
        return this.fans;
    }

    /** @param fans the {@code fans} value */
    public void setFans(Integer fans) {
        this.fans = fans;
    }

    /** @return the {@code zan} value, or {@code null} if unset */
    public Integer getZan() {
        return this.zan;
    }

    /** @param zan the {@code zan} value */
    public void setZan(Integer zan) {
        this.zan = zan;
    }

    /** @return the {@code content} value, or {@code null} if unset */
    public Integer getContent() {
        return this.content;
    }

    /** @param content the {@code content} value */
    public void setContent(Integer content) {
        this.content = content;
    }

    /** @return the {@code cust} value, or {@code null} if unset */
    public Integer getCust() {
        return this.cust;
    }

    /** @param cust the {@code cust} value */
    public void setCust(Integer cust) {
        this.cust = cust;
    }

    /** @return the {@code jf} value, or {@code null} if unset */
    public Integer getJf() {
        return this.jf;
    }

    /** @param jf the {@code jf} value */
    public void setJf(Integer jf) {
        this.jf = jf;
    }

    /** @return the {@code sc} value, or {@code null} if unset */
    public Integer getSc() {
        return this.sc;
    }

    /** @param sc the {@code sc} value */
    public void setSc(Integer sc) {
        this.sc = sc;
    }

    /** @return the weekly grade, or {@code null} if unset */
    public Integer getWeekGrade() {
        return this.weekGrade;
    }

    /** @param weekGrade the weekly grade */
    public void setWeekGrade(Integer weekGrade) {
        this.weekGrade = weekGrade;
    }

    /** @return the total grade, or {@code null} if unset */
    public Integer getTotalGrade() {
        return this.totalGrade;
    }

    /** @param totalGrade the total grade */
    public void setTotalGrade(Integer totalGrade) {
        this.totalGrade = totalGrade;
    }

    /** @return the grade, or {@code null} if unset */
    public Integer getGrade() {
        return this.grade;
    }

    /** @param grade the grade */
    public void setGrade(Integer grade) {
        this.grade = grade;
    }

    /**
     * Renders every property as {@code key=value}; string properties are
     * wrapped in single quotes and unset properties print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("CSDN_datas{");
        text.append("date=").append(this.date);
        text.append(", name='").append(this.name).append('\'');
        text.append(", codeAge='").append(this.codeAge).append('\'');
        text.append(", introduction='").append(this.introduction).append('\'');
        text.append(", yc=").append(this.yc);
        text.append(", fans=").append(this.fans);
        text.append(", zan=").append(this.zan);
        text.append(", content=").append(this.content);
        text.append(", cust=").append(this.cust);
        text.append(", jf=").append(this.jf);
        text.append(", sc=").append(this.sc);
        text.append(", WeekGrade=").append(this.weekGrade);
        text.append(", TotalGrade=").append(this.totalGrade);
        text.append(", Grade=").append(this.grade);
        text.append('}');
        return text.toString();
    }
}
获取txt并转换:
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Reads the snapshot text file (one value per line) and converts it into a
 * labelled report plus a populated {@link CSDN_datas} instance.
 */
public class TxtUtils {

    /** Layout of the timestamp on the first line of the snapshot file. */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Parses the snapshot file line by line, prints the resulting
     * {@link CSDN_datas} to standard output and returns a labelled report.
     *
     * <p>Expected line order: timestamp, nickname, code age, introduction,
     * then nine integer lines, then the level line.
     *
     * @param file the snapshot file, read as UTF-8
     * @return the report, one {@code label + value} line per input line
     * @throws IOException if the file cannot be read, or ends before all
     *         expected lines were found ({@link EOFException})
     * @throws ParseException if the first line is not a valid timestamp
     * @throws IllegalArgumentException if a numeric line cannot be parsed
     *         (includes {@link NumberFormatException})
     */
    public static String txt2String(File file) throws IOException, ParseException {
        CSDN_datas datas = new CSDN_datas();
        StringBuilder result = new StringBuilder();

        // Decode explicitly as UTF-8 instead of relying on the platform
        // default charset; try-with-resources closes the reader even when a
        // parse step below throws.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String s = nextLine(br, "timestamp");
            result.append("此次数据更新时间:").append(s).append("\n");
            DateFormat format = new SimpleDateFormat(TIMESTAMP_PATTERN);
            datas.setDate(format.parse(s));

            s = nextLine(br, "name");
            result.append("昵称:").append(s).append("\n");
            datas.setName(s);

            // Use the whole number on the line, not a single character at a
            // fixed index, so multi-digit values are not truncated.
            s = nextLine(br, "code age");
            String codeAge = firstNumber(s, "code age") + "年";
            result.append("码龄:").append(codeAge).append("\n");
            datas.setCodeAge(codeAge);

            s = nextLine(br, "introduction");
            result.append("个人简介:").append(s).append("\n");
            datas.setIntroduction(s);

            datas.setYc(readCount(br, result, "原创文章数:"));
            datas.setFans(readCount(br, result, "粉丝数:"));
            datas.setZan(readCount(br, result, "获赞数:"));
            datas.setContent(readCount(br, result, "评论数:"));
            datas.setCust(readCount(br, result, "总访客:"));
            datas.setJf(readCount(br, result, "积分数:"));
            datas.setSc(readCount(br, result, "文章被收藏数:"));
            datas.setWeekGrade(readCount(br, result, "周排名:"));
            datas.setTotalGrade(readCount(br, result, "总排名:"));

            // Same reasoning as for the code age: parse every digit of the level.
            s = nextLine(br, "grade");
            String grade = firstNumber(s, "grade");
            result.append("等级:").append(grade).append("\n");
            datas.setGrade(Integer.valueOf(grade));
        }

        System.out.println(datas);
        return result.toString();
    }

    /** Reads the next line, failing with a descriptive EOFException when the file ends early. */
    private static String nextLine(BufferedReader br, String what) throws IOException {
        String line = br.readLine();
        if (line == null) {
            throw new EOFException("Unexpected end of file while reading " + what);
        }
        return line;
    }

    /** Reads one integer line, appends {@code label + line} to the report and returns the parsed value. */
    private static Integer readCount(BufferedReader br, StringBuilder report, String label)
            throws IOException {
        String line = nextLine(br, label);
        report.append(label).append(line).append("\n");
        // Surrounding whitespace would make Integer.valueOf fail.
        return Integer.valueOf(line.trim());
    }

    /** Returns the first run of ASCII digits found in {@code line}. */
    private static String firstNumber(String line, String what) {
        int start = 0;
        while (start < line.length() && !isAsciiDigit(line.charAt(start))) {
            start++;
        }
        int end = start;
        while (end < line.length() && isAsciiDigit(line.charAt(end))) {
            end++;
        }
        if (start == end) {
            throw new IllegalArgumentException("No number found in " + what + " line: " + line);
        }
        return line.substring(start, end);
    }

    /** Tells whether {@code c} is one of '0'..'9'. */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Reads the snapshot from its fixed location and prints the report.
     *
     * @param args unused
     * @throws IOException if the file cannot be read
     * @throws ParseException if the timestamp line is malformed
     */
    public static void main(String[] args) throws IOException, ParseException {
        File file = new File("D://DeskTop/csdn.txt");
        System.out.println(txt2String(file));
    }
}
接下来就可以自由发挥,例如将数据存入数据库,或者每隔一段时间对数据进行一次分析等等。