package org.net.ht.controller;
import java.io.IOException;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small crawler that scrapes a listing page and stores one database row per
 * listed entry.
 *
 * <ol>
 *   <li>HttpClient downloads the listing page.</li>
 *   <li>Jsoup parses the listing and the two detail pages (Chinese and
 *       English) linked from every entry.</li>
 *   <li>The names and the extracted texts are inserted into MySQL.</li>
 * </ol>
 *
 * Notes carried over from the original author:
 * <ul>
 *   <li>Set a connection timeout so that hopeless requests are abandoned.</li>
 *   <li>If the crawler gets blocked, switch to another proxy IP.</li>
 *   <li>Some sites only answer browsers, so a browser User-Agent is sent.</li>
 *   <li>Not every site is encoded in UTF-8; GBK is also possible.</li>
 * </ul>
 */
public class HtmlPage {

    /** Placeholder: URL of the listing page to scrape. Replace before running. */
    private static final String LIST_URL = "要抓取数据的url 连接";

    /** Placeholder: prefix prepended to the (relative) href of the Chinese detail page. */
    private static final String CHINESE_DETAIL_PREFIX = "url/";

    /**
     * MySQL JDBC URL: jdbc:mysql://host:port/database?param=value.
     * useUnicode and characterEncoding are set to avoid garbled Chinese text.
     * The database must already exist; database name, user and password are
     * placeholders.
     */
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/数据库名称?"
            + "user=用户名&password=密码&useUnicode=true&characterEncoding=UTF8";

    /** Parameterized insert; values are bound, never concatenated into the SQL text. */
    private static final String INSERT_SQL =
            "insert into vivews(chinaName,englishName,chinaClinicalFeatures,clinicalFeatures)"
            + " values(?,?,?,?)";

    /**
     * Index of the first "half_rhythm" element that is processed.
     * NOTE(review): presumably the first two matches are not data entries - confirm
     * against the target page.
     */
    private static final int FIRST_DATA_ROW = 2;

    /**
     * Downloads the listing page, then for every entry fetches both detail
     * pages and inserts one row into the database.
     *
     * <p>A row whose pages cannot be fetched or parsed, or whose insert fails,
     * is reported and skipped; the remaining rows are still processed. If the
     * database connection cannot be opened, the error is reported and no row
     * is processed.
     *
     * @param args unused
     * @throws ClientProtocolException if the listing request violates the HTTP protocol
     * @throws IOException if the listing page cannot be downloaded
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // ============================= HttpClient ====================================
        String content = fetchListPage();

        // ============================= Jsoup =========================================
        Document doc = Jsoup.parse(content);
        Elements elements = doc.getElementsByClass("half_rhythm");

        // The request headers for the English pages never change, so build them once.
        Map<String, String> englishHeaders = buildEnglishHeaders();

        try {
            // Explicitly load the MySQL driver class.
            Class.forName("com.mysql.jdbc.Driver");
            System.out.println("成功加载MySQL驱动程序");
        } catch (ClassNotFoundException e) {
            // Not fatal by itself: if no driver ends up registered,
            // DriverManager.getConnection below fails with an SQLException.
            e.printStackTrace();
        }

        // One connection and one prepared statement are shared by all rows and are
        // closed automatically, also when an exception escapes the loop.
        try (java.sql.Connection conn = DriverManager.getConnection(JDBC_URL);
             java.sql.PreparedStatement insert = conn.prepareStatement(INSERT_SQL)) {
            for (int i = FIRST_DATA_ROW; i < elements.size(); i++) {
                System.out.println("第" + i + "行开始");
                if (processRow(elements.get(i), englishHeaders, insert)) {
                    System.out.println("第" + i + "行结束");
                }
            }
        } catch (SQLException e) {
            System.out.println("MySQL操作错误");
            e.printStackTrace();
        }
    }

    /**
     * Downloads the listing page with a browser-like User-Agent.
     *
     * @return the response body as text
     * @throws IOException if the request fails or the response carries no body
     */
    private static String fetchListPage() throws ClientProtocolException, IOException {
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpget = new HttpGet(LIST_URL);
            // Pretend to be a browser.
            httpget.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    throw new IOException("Empty response body from " + LIST_URL);
                }
                // The body is read completely here, so the response can be closed
                // before parsing starts.
                return EntityUtils.toString(entity, "utf-8");
            }
        }
    }

    /** Builds the request headers sent when fetching the English detail pages. */
    private static Map<String, String> buildEnglishHeaders() {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
        headers.put("Host", "www.omim.org");
        headers.put("Connection", "keep-alive");
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        return headers;
    }

    /**
     * Scrapes both detail pages of one listing entry and inserts the result.
     *
     * @param row one "half_rhythm" element; its first link is the Chinese entry,
     *            its second link the English entry
     * @param englishHeaders request headers for the English detail page
     * @param insert prepared {@link #INSERT_SQL} statement
     * @return {@code true} if the row was inserted, {@code false} if it was skipped
     */
    private static boolean processRow(Element row, Map<String, String> englishHeaders,
            java.sql.PreparedStatement insert) {
        Elements links = row.select("a");
        if (links.size() < 2) {
            System.out.println("跳过该行:未找到中英文链接");
            return false;
        }
        Element chineseLink = links.get(0);
        Element englishLink = links.get(1);
        String chinaeseName = chineseLink.text();
        String englishName = englishLink.text();

        // Chinese detail page. getDoc returns null when the download fails.
        String text = extractChineseText(getDoc(CHINESE_DETAIL_PREFIX + chineseLink.attr("href")));
        if (text == null) {
            System.out.println("未获取到中文详情页内容");
            System.out.println("名字为:" + chinaeseName + "执行失败");
            return false;
        }

        // English detail page. getDocUserHeaders reports every failure as a
        // RuntimeException; one unreachable page must not abort the whole crawl.
        Document englishDoc;
        try {
            englishDoc = getDocUserHeaders(englishLink.attr("href"), englishHeaders);
        } catch (RuntimeException e) {
            e.printStackTrace();
            System.out.println("名字为:" + chinaeseName + "执行失败");
            return false;
        }
        String engText = extractEnglishText(englishDoc);
        if (engText == null) {
            System.out.println("未获取到英文详情页内容");
            System.out.println("名字为:" + chinaeseName + "执行失败");
            return false;
        }

        try {
            // Bound parameters need no quote escaping, so the scraped text is stored
            // unmodified (the former apostrophe replacements are gone on purpose).
            insert.setString(1, chinaeseName);
            insert.setString(2, englishName);
            insert.setString(3, text);
            insert.setString(4, engText);
            // executeUpdate returns the number of affected rows.
            int result = insert.executeUpdate();
            if (result > 0) {
                System.out.println("执行成功!!!!");
                return true;
            }
            System.out.println("执行失败!!!!");
            System.out.println("名字为:" + chinaeseName + "执行失败");
            return false;
        } catch (SQLException e) {
            System.out.println("MySQL操作错误");
            System.out.println("名字为:" + chinaeseName + "执行失败");
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Collects the text of the direct child divs of the first
     * ".jig-ncbiinpagenav.body-content.whole_rhythm" element, one per line, up to
     * and including the first div whose text starts with "遗传咨询".
     *
     * @param chinaDoc the Chinese detail page, may be {@code null}
     * @return the collected text, or {@code null} if the page or the container is missing
     */
    private static String extractChineseText(Document chinaDoc) {
        if (chinaDoc == null) {
            return null;
        }
        Element nav = chinaDoc.select(".jig-ncbiinpagenav.body-content.whole_rhythm").first();
        if (nav == null) {
            return null;
        }
        StringBuilder text = new StringBuilder();
        for (Element section : nav.select(">div")) {
            String sectionText = section.text();
            text.append(sectionText).append("\n");
            if (sectionText.startsWith("遗传咨询")) {
                break;
            }
        }
        return text.toString();
    }

    /**
     * Returns the text of the first direct child div of the first
     * ".body-content.whole_rhythm" element.
     *
     * @param englishDoc the English detail page, may be {@code null}
     * @return the text, or {@code null} if the page or one of the elements is missing
     */
    private static String extractEnglishText(Document englishDoc) {
        if (englishDoc == null) {
            return null;
        }
        Element nav = englishDoc.select(".body-content.whole_rhythm").first();
        if (nav == null) {
            return null;
        }
        Element firstSection = nav.select(">div").first();
        return firstSection == null ? null : firstSection.text();
    }

    /**
     * Fetches and parses a page with Jsoup (60 s timeout, unlimited body size).
     *
     * <p>Unlike {@link #getDocUserHeaders(String, Map)} this method does not throw:
     * a failure is printed and reported as {@code null}.
     *
     * @param url address of the page
     * @return the parsed document, or {@code null} if the page could not be fetched
     */
    public static Document getDoc(String url) {
        try {
            return Jsoup.connect(url).timeout(60000).maxBodySize(0).get();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches and parses a page with Jsoup, sending the given request headers
     * (60 s timeout, unlimited body size).
     *
     * @param url address of the page
     * @param headers request headers to send; {@code null} means none
     * @return the parsed document
     * @throws RuntimeException wrapping the original cause if the page cannot be fetched
     */
    public static Document getDocUserHeaders(String url, Map<String, String> headers) {
        try {
            Connection conn = Jsoup.connect(url);
            if (headers != null) {
                for (Map.Entry<String, String> header : headers.entrySet()) {
                    conn.header(header.getKey(), header.getValue());
                }
            }
            return conn.timeout(60000).maxBodySize(0).get();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}