前言
之前写了一个爬取网络小说网站的完整小说项目:抓取内容 -> 分析内容 -> 生成数据,再通过程序自动根据模板生成搜索页面和发布页面。现在一直用不到,我在这里分享出来,供有需要的朋友参考使用。
使用的技术:
java语言 jdk1.8
框架:Spring Boot,版本号随意,可根据自己的情况调整
mysql 5.8
涉及到的额外技术: velocity模板
lucene:索引使用
直接上代码
爬数据
// 初始化 防止https 验证
public void init() {
try {
SSLContext context = SSLContext.getInstance(“TLS”);
context.init(null, new X509TrustManager[]{new X509TrustManager() {
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
public X509Certificate[] getAcceptedIssuers() {
return new X509Certificate[0];
}
}}, new SecureRandom());
HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Disables both hostname verification and certificate validation for every
 * {@code HttpsURLConnection} created afterwards in this JVM.
 *
 * <p>Two process-wide defaults are replaced:
 * <ul>
 *   <li>the default hostname verifier, with one that returns {@code true} for any host;</li>
 *   <li>the default SSL socket factory, with one from a {@code TLS} context whose only
 *       trust manager has empty check methods.</li>
 * </ul>
 *
 * <p>SECURITY: this removes all protection against man-in-the-middle attacks; only
 * acceptable for a throw-away crawler.
 *
 * <p>The call is best-effort: a failure is reported on {@code System.err} and not rethrown,
 * so the defaults may be left partially or wholly unchanged.
 */
public static void trustEveryone() {
    try {
        HttpsURLConnection.setDefaultHostnameVerifier((hostname, session) -> true);

        SSLContext context = SSLContext.getInstance("TLS");
        context.init(null, new X509TrustManager[] { new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                // Intentionally empty: never rejects a client certificate chain.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                // Intentionally empty: never rejects a server certificate chain.
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[0];
            }
        } }, new SecureRandom());
        HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
    } catch (Exception e) {
        // Still best-effort, but no longer silent: otherwise later HTTPS requests fail
        // with handshake errors and nothing points back to this setup step.
        System.err.println("trustEveryone: failed to install trust-all HTTPS defaults: " + e);
    }
}
/**
 * Fetches a novel listing page and walks its {@code row} elements.
 *
 * <p>NOTE: this is an excerpt. The original text stops inside the loop body, so the rest of
 * the loop, the {@code catch} clause and the return statement are not shown here.
 *
 * <p>Request parameters read:
 * <ul>
 *   <li>{@code url} - read, but currently overwritten by a hard-coded listing URL;</li>
 *   <li>{@code errorUrl} - read; not used in the visible part;</li>
 *   <li>{@code totalIndex} - exclusive upper row index, defaults to 12 when absent;</li>
 *   <li>{@code index} - first row index to process, defaults to 0 when absent.</li>
 * </ul>
 *
 * @param request  the current HTTP request carrying the parameters above
 * @param response the current HTTP response (not used in the visible part)
 * @return not visible in this excerpt
 */
public Map<String, Object> getAutoCrawlEntData(HttpServletRequest request, HttpServletResponse response) {
    try {
        String url = request.getParameter("url");
        String errorUrl = request.getParameter("errorUrl");
        String totalIndex = request.getParameter("totalIndex");
        int totalPageSize = 12;
        if (totalIndex != null) {
            totalPageSize = Integer.parseInt(totalIndex);
        }
        // NOTE(review): this discards the "url" request parameter read above; it looks
        // like a debugging leftover - confirm before relying on the parameter.
        url = "https://www.aaa.io/c/yanqing_3.html";
        // makeVideo(1,url,master,director,year,area,typeName);
        // Address and port of the proxy server.
        String proxyHost = "127.0.0.1";
        int proxyPort = 10809;
        int index = 0;
        String indexStr = request.getParameter("index");
        // Keep the default of 0 when the parameter is absent, mirroring totalIndex above
        // (parsing null would throw NumberFormatException).
        if (indexStr != null) {
            index = Integer.parseInt(indexStr);
        }
        int page = 0;
        int category = 2;
        String categoryName = "言情";
        System.out.println("当前页码:"+page);
        Document doc_list = Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3").timeout(30000).get();
        Elements rows = doc_list.getElementsByClass("row");
        // Never index past the rows actually present on the page.
        int lastRow = Math.min(totalPageSize, rows.size());
        for (int k = index; k < lastRow; k++) {
            Element element = rows.get(k);
            Elements h3s = element.getElementsByTag("h3");
            if (h3s != null) {
            }
存数据
//生成书籍
Long insertNovel(String title,String author,int category,String image,String status,String desc,String typeName) throws BadHanyuPinyinOutputFormatCombination {
Novel query = new Novel(); query.setTitle(title);query.setAuthor(author);
Novel novel = novelDao.getArticleByTitle(query);
Long id = 0l;
if(novel != null){
id = novel.getId();
novel.setTitle(title);
novel.setAuthor(author);
novel.setNovelType(category);
novel.setTypeName(typeName);
novel.setImageUrl(image);
novel.setDescription(desc);
novel.setStatus(status);
novel.setUpdateTime(new Date());
SimpleDateFormat sdf = new SimpleDateFormat(“yyyy”);
Date date = new Date();
String y= sdf.format(date);
String link = novel.getNovelType()+“/”+y+“/”+ CharUtil.chineseToPinyin(novel.getTitle())+“/novel.html”; //存储路径
novel.setLinkUrl(link);
novelService.updateArticle(novel);
InformationIndexUtil u=new InformationIndexUtil();//索引文章
try {
u.indexArticles(false,novel);
} catch (IOException e) {
e.printStackTrace();
}
}else{
Novel newNovel = new Novel();
newNovel.setTitle(title);
newNovel.setAuthor(author);
newNovel.setNovelType(category);
newNovel.setTypeName(typeName);
newNovel.setImageUrl(image);
newNovel.setDescription(desc);
newNovel.setStatus(status);
newNovel.setViews(0);
newNovel.setCreateTime(new Date());
SimpleDateFormat sdf = new SimpleDateFormat(“yyyy”);
Date date = new Date();
String y= sdf.format(date);
String link = newNovel.getNovelType()+“/”+y+“/”+ CharUtil.chineseToPinyin(newNovel.getTitle())+“/novel.html”; //存储路径
newNovel.setLinkUrl(link);
novelService.addArticle(newNovel);
InformationIndexUtil u=new InformationIndexUtil();//索引文章
try {
u.indexArticles(false,newNovel);
} catch (IOException e) {
e.printStackTrace();
}
id= novelDao.getMaxId();
}
return id;
}
// 这里根据情况 生成对应的html页面
@RequestMapping(“/api/pushList”)
@ResponseBody
public Map<String, Object> pushList(HttpServletRequest request) {
try {
String req_type = request.getParameter(“req_type”);
int type = Integer.parseInt(req_type);
//String total = request.getParameter(“total”);
//int totalPage = Integer.parseInt(total);
Novel article1 = new Novel();
article1.setNovelType(type);
PageEntity page1 = new PageEntity();
this.setPage(page1);
page1.setCurrentPage(1);
this.getPage().setPageSize(10);
novelService.queryArticleListPage(article1,this.getPage());
System.out.println(“--------------------该分类总计-------------”+this.getPage().getTotalPageSize());
for(int k = 1; k <= this.getPage().getTotalPageSize(); k++){
PageEntity page = new PageEntity();
page.setCurrentPage(k);
// 页面传来的数据放到page中
this.setPage(page);
this.getPage().setPageSize(10);
List articleDataList = null;
Novel article = new Novel();
article.setNovelType(type);
articleDataList = novelService.queryArticleListPage(article,this.getPage());
if(articleDataList != null && articleDataList.size() > 0){
Map<String,Object> returnMap = publishInfo(articleDataList,this.getPage(), type);
}
}
this.setJson(true, "success", null);
} catch (Exception e) {
logger.error("AdminBlogController.pushIndex", e);
this.setJson(false, "false", null);
}
return json;
}