功能描述:
(1)、使用Java爬虫,爬取CSDN上某页面的标题及其对应的URL;
(2)、将所爬取的数据存入MySQL数据库;
(3)、将所爬取的数据生成CSV文件。
所涉及的技术点:多线程、Java爬虫、饿汉式单例、CSV文件写入
本次实验主要使用的是hutool工具库
项目的文件结构:
DB.java
package com.chl.zz.Dao;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import org.omg.CORBA.PUBLIC_MEMBER;
import com.chl.zz.Service.Spider;
import com.chl.zz.model.Demo;
/**
 * Runnable task that inserts every scraped {@link Demo} into the MySQL table {@code t_test}.
 *
 * <p>One JDBC connection is opened in the static initializer and shared by all instances.
 * This class never closes that connection.
 */
public class DB implements Runnable
{
    // Reference to the shared list returned by Spider.getConlists(), read at construction time.
    ArrayList<Demo> lists = Spider.getConlists();

    // Shared connection; stays null when the static initializer below fails.
    private static Connection conn;

    static
    {
        try
        {
            Class.forName("com.mysql.jdbc.Driver");
            // NOTE(review): URL and credentials are hard-coded; consider externalising them.
            conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/chl_db", "root", "123456");
            System.out.println("数据库连接成功" + conn);
        } catch (ClassNotFoundException e)
        {
            e.printStackTrace();
        } catch (SQLException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Inserts one row per collected item: column 1 is the title text, column 2 is the link.
     *
     * <p>The statement is prepared once and closed when the loop ends. A failure on a single
     * row is reported and the remaining rows are still attempted.
     */
    @Override
    public void run()
    {
        if (conn == null)
        {
            // The static initializer already printed the cause; nothing can be inserted.
            System.err.println("Database connection is not available; skipping insert.");
            return;
        }
        String sql = "insert into t_test values(?,?) ";
        try (PreparedStatement statement = conn.prepareStatement(sql))
        {
            for (Demo item : lists)
            {
                try
                {
                    statement.setString(1, item.getContent());
                    statement.setString(2, item.getLink());
                    statement.executeUpdate();
                } catch (SQLException e)
                {
                    // Keep going so that one bad row does not abort the whole import.
                    e.printStackTrace();
                }
            }
        } catch (SQLException e)
        {
            // Raised when the statement cannot be prepared or closed.
            e.printStackTrace();
        }
    }
}
Demo.java
package com.chl.zz.model;
/**
 * Simple mutable holder for one scraped entry: a title text and the URL it points to.
 * Both values are {@code null} until set.
 */
public class Demo {

    /** Title text of the entry. */
    private String content;

    /** URL associated with the title. */
    private String link;

    /** @return the stored title text, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the title text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the stored URL, or {@code null} if none has been set */
    public String getLink() {
        return this.link;
    }

    /** @param link the URL to store */
    public void setLink(String link) {
        this.link = link;
    }
}
InCsv.java
package com.chl.zz.Service;
import java.util.ArrayList;
import com.chl.zz.model.Demo;
import cn.hutool.core.text.csv.CsvUtil;
import cn.hutool.core.text.csv.CsvWriter;
import cn.hutool.core.util.CharsetUtil;
/**
 * Runnable task that exports the scraped entries to the CSV file {@code D:/z1.csv},
 * one row per entry with the title in column 0 and the link in column 1.
 */
public class InCsv implements Runnable
{
    // Writer used by run(); opened there and closed before run() returns.
    CsvWriter csv;
    // Reference to the shared list returned by Spider.getConlists(), read at construction time.
    ArrayList<Demo> lists = Spider.getConlists();
    // Entry count reported by Spider when this object was constructed.
    int num = Spider.getNum();
    // Row buffer: sa[i][0] is the title, sa[i][1] is the link.
    String[][] sa = new String[num][2];

    /**
     * Copies the list into the row buffer and writes every row to the CSV file in GBK encoding.
     */
    @Override
    public void run()
    {
        // The buffer was sized at construction time. Re-size it if the list has changed since,
        // otherwise the copy below would overflow or leave empty trailing rows.
        int count = lists.size();
        if (sa.length != count)
        {
            sa = new String[count][2];
        }

        // Convert the generic collection into the array form.
        int row = 0;
        for (Demo item : lists)
        {
            sa[row][0] = item.getContent();
            sa[row][1] = item.getLink();
            row++;
        }

        csv = CsvUtil.getWriter("D:/z1.csv", CharsetUtil.CHARSET_GBK);
        try
        {
            // Write the converted rows.
            for (String[] line : sa)
            {
                csv.write(line);
            }
        } finally
        {
            // Release the file handle even when a write fails.
            csv.close();
        }
    }
}
Spider.java
package com.chl.zz.Service;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.chl.zz.model.Demo;
public class Spider
{
private static ArrayList<Demo> conlists=new ArrayList<Demo>();
private static int num;
public Spider()
{
String url="https://blog.csdn.net/?spm=1001.2101.3001.4477";
Connection connect = Jsoup.connect(url);
System.out.println("网站连接成功"+connect);
try
{
Document doc = connect.get();
//System.out.println(doc);
List<Element> lists=doc.select("ul li .list_con");
for(Element list:lists)
{
String content=list.select(".title h2 a").text();
String link=list.select(".title h2 a").attr("href");
if(null!=content&&null!=link)
{
System.out.println(content+"---"+link);
//将爬取的数据放入集合
Demo demo=new Demo();
demo.setContent(content);
demo.setLink(link);
conlists.add(demo);
}
}
//验证是否数据存入集合
System.out.println(conlists.size());
num=conlists.size();
} catch (IOException e)
{
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static ArrayList<Demo> getConlists()
{
return conlists;
}
public static int getNum()
{
return num;
}
}
App.java
package com.chl.zz;
import com.chl.zz.Dao.DB;
import com.chl.zz.Service.InCsv;
import com.chl.zz.Service.Spider;
/**
 * Program entry point: runs the crawler, then starts one thread that stores the
 * results in the database and another that exports them to a CSV file.
 */
public class App {

    /**
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // Run the crawler first; its constructor fills the shared result list.
        new Spider();

        // Write the data to the database on its own thread.
        Thread dbWriter = new Thread(new DB());
        dbWriter.start();

        // Write the data to the CSV file on a second thread.
        Thread csvWriter = new Thread(new InCsv());
        csvWriter.start();
    }
}
通过本次实验,我认识了一个之前从未见过的异常:MySQLSyntaxErrorException(它是SQLException的子类,在程序运行时抛出)。产生的原因是DB.java文件中对应的SQL语句写错了。