用Java获取页面,然后用Jsoup来得到自己想要的数据,再保存到数据库(我用了Hibernate的框架),最后用自己的网站显示这些数据
豆瓣本身貌似提供了给开发者使用的接口,但是我不想去注册账号(我还没有豆瓣账号),就想自己通过网页源码分析,然后拿到自己想要的数据。
在看豆瓣的网页源码的时候,通过用Chrome的F12分析工具中的Network网络分析,发现了豆瓣一个请求的接口,其返回值是Json数组,数组包含电影评分,名字,图片的url,导演等数据
用QQ的截屏工具貌似还很好用(Ctrl+Alt+A)
通过这个接口可以获取豆瓣的电影(评分由高到低,每次请求只会返回20个数据,但通过更改url里的start的数值(从这位置开始,向后获得20个数据),可以获取更多的数据
现在Java有别人写好的现成的解析Json数据的包,但这次我没用,自己尝试去解析这个Json数据,---用正则表达式,和String字符串的方法,来获取自己想要的数据,然后封装成类,保存到数据库
代码写的有些乱,中间出错过几次,写的方法代码有的注释掉了,有的方法没有用,感觉错误也是有保留价值的,,,(这是我自己写的,来练手的),
下面代码获取了Json数据,main函数也在这:
1 packagezhangtianxiao;2
3 import java.io.*;4 importjava.net.HttpURLConnection;5 importjava.net.MalformedURLException;6 importjava.net.URL;7
8 importorg.jsoup.Jsoup;9 importorg.jsoup.nodes.Document;10 importorg.jsoup.nodes.Element;11 importorg.jsoup.select.Elements;12
13 importJson.JsonParase;14
15 public classDownLoadHTML {16 public String s_html = "";17 public String url = "";18 publicDownLoadHTML(String url) {19 this.url =url;20
21 try{22 URL urlc =newURL(url);23 InputStream is =urlc.openStream();24 BufferedReader br = new BufferedReader(newInputStreamReader(is));25
26 String s1 = null;27
28 while((s1 = br.readLine()) != null)29 {30 s_html +=s1;31 //System.out.println(s1);
32 }33
34 } catch(IOException e) {35 //TODO Auto-generated catch block
36 e.printStackTrace();37 }38
39 }40
41 public voidoutPut()42 {43 //Document doc = null;44 //try {45 //doc = Jsoup.connect("https://movie.douban.com/").get();46 //} catch (IOException e1) {47 // //TODO Auto-generated catch block48 //e1.printStackTrace();49 //}50 //Elements elements = doc.select("a");51 //for(Element e : elements)52 //{53 //System.out.println(e.toString());54 //}
55 File f1 = new File("E:/java/资源/1.html");56 try{57 Document doc = Jsoup.parse(f1,"utf-8","");58 Elements elements = doc.select("a.item");59 for(Element e : elements)60 {61 System.out.println(e.toString()+"\n\n");62 System.out.println(e.attr("href"));63 }64 } catch(IOException e) {65 //TODO Auto-generated catch block
66 e.printStackTrace();67 }68
69 }70
71 public static voidmain(String[] args)72 {73 DownLoadHTML d = new DownLoadHTML("https://movie.douban.com/tag/#/?sort=S&range=0,10&tags=");74 //d.outPut();
75
76 d.getDetail1();77
78 }79
80
81 //测试用获取页面用的方法
82 public voidgetDetail()83 {84 String url = "https://movie.douban.com/subject/24751811/";85 try{86 Document doc =Jsoup.connect(url).get();87 //System.out.println(doc.toString());
88 Element e = doc.selectFirst("span.short").child(0);89 System.out.println(e.text());90 Element e1 = doc.selectFirst("a.lnk-sharing");91 System.out.println(e1.attr("data-name"));92 } catch(IOException e) {93 //TODO Auto-generated catch block
94 e.printStackTrace();95 }96
97 }98
99 //由于上面的方法无法解析服务器发回的数据,网上搜了一下
100 public voidgetDetail1()101 {102
103
104
105 HttpURLConnection conn = null;106 try{107 URL realUrl = new URL("https://movie.douban.com/j/new_search_subjects?sort=S&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=0");108 conn =(HttpURLConnection)realUrl.openConnection();109 conn.setRequestMethod("GET");110 conn.setUseCaches(false);111 conn.setReadTimeout(8000);112 conn.setConnectTimeout(8000);113 conn.setInstanceFollowRedirects(false);114 }catch(Exception e){115 e.printStackTrace();116 }117
118
119 try{120 int responseCode =conn.getResponseCode();121 System.out.println(responseCode);122 } catch(IOException e) {123 //TODO Auto-generated catch block
124 e.printStackTrace();125 }126
127
128 BufferedReader in = null;129
130 String result = "";131
132 try{133 in = new BufferedReader(new InputStreamReader(conn.getInputStream(),"utf-8"));134 String line;135 while((line = in.readLine()) != null)136 {137 System.out.println(line);138 result +=line;139 }140 } catch(UnsupportedEncodingException e) {141 //TODO Auto-generated catch block
142 e.printStackTrace();143 } catch(IOException e) {144 //TODO Auto-generated catch block
145 e.printStackTrace();146 }147
148 JsonParase.parase(result);149 }150
151
152
153 }
然后怕代码太长,把自己搞蒙了,把解析Json的工作,拿了出来单独一个类解析Json数据:
1 packageJson;2
3 importjava.io.IOException;4 importjava.util.regex.Matcher;5 importjava.util.regex.Pattern;6
7 importorg.hibernate.Session;8 importorg.hibernate.Transaction;9 importorg.jsoup.Jsoup;10 importorg.jsoup.nodes.Document;11 importorg.jsoup.nodes.Element;12
13 public classJsonParase {14 public static voidparase(String json)15 {16 String regex = "(\\{\"directors\":)(.+)(\\})";17 String regex1 = "(?<=\\{)(\"dire.+?)(?=\\})";18 Pattern p =Pattern.compile(regex1);19 Matcher m =p.matcher(json);20 while(m.find())21 {22 String e1 = null;23 System.out.println(e1 =m.group());24 String regex2 = "(?<=\")(.+?)(?=\")";25
26 //Matcher m1 = Pattern.compile(regex2).matcher(e1);27 //while(m1.find())28 //{29 //System.out.println(m1.group());30 //}
31
32 String[] ms = e1.split(",");33 Movie movie = newMovie();34 //System.out.println(ms.length);
35 for(String m1 : ms)36 {37 //System.out.println(m1);
38
39 Matcher matcher1 = Pattern.compile(regex2).matcher(m1.replace(":", ""));40
41 while(matcher1.find())42 {43 //System.out.println(matcher1.group());
44 if(matcher1.group().equals("title"))45 {46 matcher1.find();47 //System.out.println(matcher1.group()+"****************");
48 movie.setTitle(matcher1.group().replace("\"", ""));49 System.out.println(movie.getTitle());50 }51 if(matcher1.group().equals("rate"))52 {53 matcher1.find();54 //System.out.println(matcher1.group()+"****************");
55 movie.setRate(matcher1.group().replace("\"", ""));56 System.out.println(movie.getRate());57 }58 if(matcher1.group().equals("url"))59 {60 matcher1.find();61 //System.out.println(matcher1.group()+"****************");
62 movie.setUrl(matcher1.group().replace("\"", "").replaceAll("https", "https:"));63 System.out.println(movie.getUrl());64 }65 if(matcher1.group().equals("cover"))66 {67 matcher1.find();68 //System.out.println(matcher1.group()+"****************");
69 movie.setCoverurl(matcher1.group().replace("\"", "").replaceAll("https", "https:"));70 System.out.println(movie.getCoverurl());71 }72 }73
74 //没有考虑到http://***********这种类型数据75 //String[] ms1 = m1.split(":");76 //
77 //
78 //
79 //if(ms1.length == 2)80 //{81 //Movie movie = new Movie();82 // //System.out.println(ms1[0]);83 // //System.out.println(ms1[0].replaceAll("\"", ""));84 //if(ms1[0].replaceAll("\"", "") == "title")85 //{86 //movie.setTitle(ms1[1].replaceAll("\"", ""));87 //}88 //if(ms1[0].replaceAll("\"", "") == "rate")89 //{90 //movie.setRate(ms1[1].replaceAll("\"", ""));91 //}92 //if(ms1[0].replaceAll("\"", "") == "title")93 //{94 //movie.setTitle(ms1[1].replaceAll("\"", ""));95 //}96 //}97 // //System.out.println();
98 }99 getComment(movie);100 storeMovie(movie);101
102 }103 }104 public static voidgetComment(Movie m)105 {106 //System.out.println(m.getUrl());
107 String url = m.getUrl().replace("\\", "");108 try{109 Document doc =Jsoup.connect(url).get();110 //System.out.println(doc);
111
112 if(doc.selectFirst("span.short") == null)113 {114 if(doc.selectFirst("div#link-report") == null)115 {116 //System.out.println("---------------------------");
117 return;118 }119 Element e2 = doc.selectFirst("div#link-report");120 //System.out.println("************\n"+e2.child(0).text());
121 m.setComment(e2.child(0).text());122 return;123 }124
125 Element e = doc.selectFirst("span.short").child(0);126
127 //System.out.println(e.text());
128
129 m.setComment(e.text());130 Element e1 = doc.selectFirst("a.lnk-sharing");131 //System.out.println(e1.attr("data-name"));
132 } catch(IOException e) {133 //TODO Auto-generated catch block
134 e.printStackTrace();135 }136 }137 public static voidstoreMovie(Movie m)138 {139 Session session =HibernateTools.getSession();140 Transaction tx =session.beginTransaction();141 session.save(m);142 tx.commit();143 session.close();144 //HibernateTools.closeSessionFactory();
145 }146 }
代码用到了Jsoup(这个库感觉挺好用,可以像写Js代码一样来操纵Html页面的元素),然后Hibernate来访问数据库,
上面代码获取了20条数据,保存到数据库,然后我在自己写的网页上读取了这些数据:
自己尝试去解析Json数据还挺麻烦的,而且效果不是很好,没有别人封装好的简洁,有空去看看别人的源码,学习学习,,,,,,,,,,,,,,,,,,