package com.sun.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads the comment pages of one Douban movie, extracts the individual
 * comments with jsoup and saves the result to a local text file.
 */
public class DataDownUtil {

    /** Amount by which the {@code start} query parameter grows per page. */
    private static final int PAGE_STEP = 20;

    /** Largest {@code start} offset that is still requested. */
    private static final int LAST_PAGE_START = 60;

    /**
     * Downloads the source of the page to be crawled.
     *
     * <p>The lines of the response are concatenated without any line
     * separator. Failures are reported on the console (stack trace plus a
     * short message) and are not propagated; whatever was read before the
     * failure is returned.
     *
     * @author UPO
     * @param url      address of the page to download
     * @param encoding name of the charset used to decode the response
     * @return the page source, or an empty string when nothing could be read
     */
    public static String getHtmlResourceByUrl(String url, String encoding) {
        StringBuilder buffer = new StringBuilder();
        try {
            // Establish the network connection.
            URL urlObj = new URL(url);
            URLConnection connection = urlObj.openConnection();
            // try-with-resources closes the reader (and the wrapped stream)
            // on every path, including read failures.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
            System.out.println("网络连接不可用");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("网络连接失败");
        }
        return buffer.toString();
    }

    /**
     * Fetches the comment pages with {@code start} = 0, 20, 40 and 60 of
     * Douban movie 26266893 and collects name, text, time and vote count of
     * every comment item found.
     *
     * <p>A page that could not be downloaded, or that does not contain the
     * {@code comments} container, is skipped instead of aborting the run.
     * The collected text is also printed to standard output.
     *
     * @return the collected comments, one labelled field per line
     */
    public static String getContext() {
        StringBuilder context = new StringBuilder();
        for (int start = 0; start <= LAST_PAGE_START; start += PAGE_STEP) {
            // URL as shown in the browser address bar; each page loads 20 comment items.
            String url = "https://movie.douban.com/subject/26266893/comments?start=" + start + "&limit=20&sort=new_score&status=P";
            String encoding = "utf-8";

            // 1. Fetch the page source.
            String html = getHtmlResourceByUrl(url, encoding);

            // 2. Parse it.
            Document document = Jsoup.parse(html);

            // 3. The outermost container has the id "comments".
            Element commentsRoot = document.getElementById("comments");
            if (commentsRoot == null) {
                // Happens when the download failed or the page layout differs.
                System.err.println("No element with id \"comments\" found for start=" + start + ", page skipped");
                continue;
            }

            // 4. Every item inside it carries the class "comment-item".
            Elements items = commentsRoot.getElementsByClass("comment-item");
            for (Element item : items) {
                // last() yields null when the item contains no <a> tag at all.
                Element lastLink = item.getElementsByTag("a").last();
                String name = (lastLink == null) ? "" : lastLink.text();
                String desc = item.getElementsByClass("short").text();
                String time = item.getElementsByClass("comment-time").text();
                String votes = item.getElementsByClass("votes").text();

                context.append("\n");
                context.append("name:").append(name)
                        .append("\ndesc:").append(desc)
                        .append("\ntime:").append(time)
                        .append("\nvotes:").append(votes);
                context.append("\n");
            }
        }
        System.out.println(context);
        return context.toString();
    }

    /**
     * Writes the given text to a file using UTF-8, replacing any existing
     * content. Missing parent directories are created first.
     *
     * <p>Failures are reported on the console and are not propagated.
     *
     * @author Sun Jingqin
     * @version 1.0
     * @param context  the parsed content to store
     * @param filePath path of the file to write
     */
    public static void writeFileByLine(String context, String filePath) {
        File file = new File(filePath);

        // FileOutputStream does not create directories, so make sure the
        // target folder exists before opening the file.
        File parent = file.getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            System.err.println("Could not create directory " + parent);
            return;
        }

        try (FileOutputStream out = new FileOutputStream(file);
                PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
            printWriter.print(context);
            printWriter.flush();
            // PrintWriter never throws on write errors; ask it explicitly.
            if (printWriter.checkError()) {
                System.err.println("Error while writing to " + filePath);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Crawls the comments and stores them in a local text file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("你好阿泡");
        // 1. Get the parsed page data.
        String context = getContext();
        System.out.println(context);
        // 2. Save it to a txt file.
        String filePath = "D:/movie/bigdata.txt";
        writeFileByLine(context, filePath);
        // 3. Saving to the HDFS file system is not implemented here.
    }
}