使用 Java 简单爬取网站信息并生成图片缩略图
获取指定网页的 HTML 源码
/**
 * Downloads the page at the given address and returns its body as one string.
 *
 * <p>The response is decoded as UTF-8 and read line by line. Line terminators are
 * dropped and the lines are concatenated without a separator, so the returned text
 * contains no line breaks.
 *
 * @param httpUrl address of the page to fetch; it must resolve to an HTTP(S) connection
 *                because the connection is cast to {@code HttpURLConnection}
 * @return the response body with line terminators removed; empty if the body is empty
 * @throws IOException if the URL is malformed, the connection cannot be opened, or the
 *                     response cannot be read
 */
public static String getHtmlCode(String httpUrl) throws IOException {
    URL url = new URL(httpUrl); // parse the requested address
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    // Send a browser-style User-Agent header with the request.
    connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
    connection.setDoInput(true);
    // setDoOutput(true) is intentionally not called: a GET carries no request body, and
    // enabling output can make some HttpURLConnection implementations send a POST instead.
    connection.setRequestMethod("GET");
    connection.connect();
    String fileEncode = "UTF-8";
    // StringBuilder avoids re-copying the whole page on every appended line.
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the reader (and the response stream) even when a read fails.
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), fileEncode))) {
        String input;
        while ((input = reader.readLine()) != null) { // null marks the end of the stream
            content.append(input);
        }
    }
    return content.toString();
}
引入jsoup包
<!-- jsoup HTML parser; the extraction code in this article calls Jsoup.parse on fetched pages. -->
<!-- NOTE(review): 1.9.2 is an old release; check for a newer version before reusing this. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
然后使用 jsoup 提取数据，这里以博库网为例：
for (Element element1 : doc.select("div[class=wd-980 fl scale-box br-1-e8]").select("div[class=wd-640 fl]")) {
//通过jscouo获取类型作者,出版社,定价,售价,出版时间,副标题
String publishingtime = element1.select("div[class=wd-30p fl to-hd cl-9]").text();
String a = element1.select("a[class=db fs-16 lh-30 to-hd fw-bd hover]").attr("href");
String title = element1.select("a[class=db fs-16 lh-30 to-hd fw-bd hover]").text();
String price = element1.select("div[class=lh-30]").select("del[class=cl-9 mr-10]").text();
String author = element1.select("div[class=wd-30p fl to-hd mr-10]").select("a").attr("href");
String publisher = element1.select("div[class=wd-30p fl to-hd cl-9 mr-10]").select("a[class=hover]").text();
//String oprice=element1.select("div[class=lh-30]").select("span[class=fs-21 cl-rd-l fw-bd mr-20]").text();
String sub_desc=element1.select("div[class=cl-9 lh-20 ht-40 oh fs-12]").text();
//通过第二个链接获取简介和isbn码
String D= GetWeb.getHtmlCode("https:"+a);
Document HTML1 = Jsoup.parse(D);
//读取图片地址
String imags1=HTML1.select("div[id=slider]").select("li[class=pr cp]").attr("data-thumb");
//爬取图片
//给图片重命名
String filename1=UUID.randomUUID().toString();
if(filename1!=null) {
//获取图片
URL imgURL = new URL(imags1.trim());//转换URL
HttpURLConnection urlConn = (HttpURLConnection) imgURL.openConnection();//构造连接
urlConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36");
urlConn.connect();
//存储地址
String path = PathUtil.getClasspath() + "/uploadFiles/bookimages";
if (urlConn.getResponseCode() == 200) {//返回的状态码是200 表示成功
InputStream ins = urlConn.getInputStream(); //获取输入流,从网站读取数据到 内存中
FileOutputStream fos = FileUtils.openOutputStream(new File(path + "/" + filename1 + ".jpg"));//打开FileOutStrean流
IOUtils.copy(ins, fos);//将MultipartFile file转成二进制流并输入到FileOutStrean
fos.close();
}
//获得缩略图
String pathname=path+"/"+filename1+"yasuo.jpg";//
File file=new File(pathname);
ReduceImg.downloadCompressedPicture(file, imags1);
}
生成缩略图
public class ReduceImg {
/**
 * Downloads the image at {@code urlstr}, scales it to one sixth of its width and height
 * (never below one pixel per side) and writes the result to {@code file}.
 *
 * <p>The output is encoded as BMP regardless of the extension {@code file} carries.
 * I/O failures are printed to standard output and reported through the return value.
 * Requires JDK 8.
 *
 * @param file   destination file for the scaled image
 * @param urlstr address of the source image
 * @return {@code true} if the thumbnail was written; {@code false} if the URL is invalid,
 *         the content is not a readable image, or an I/O error occurs
 */
public static boolean downloadCompressedPicture(File file, String urlstr) {
    try {
        URL url = new URL(urlstr);

        // 1. Read the source image; the stream is closed as soon as decoding is done.
        BufferedImage preImage;
        try (BufferedInputStream bufferedInputStream = new BufferedInputStream(url.openStream())) {
            preImage = ImageIO.read(bufferedInputStream);
        }
        // ImageIO.read yields null when no registered reader understands the data.
        if (preImage == null) {
            return false;
        }

        // 2. Target size is 1/6 per side; clamp to 1 because BufferedImage rejects 0.
        int width = Math.max(1, preImage.getWidth() / 6);
        int height = Math.max(1, preImage.getHeight() / 6);

        // 3. Draw the source, scaled, onto a fresh RGB canvas.
        BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
        Graphics graphic = image.createGraphics();
        try {
            graphic.drawImage(preImage, 0, 0, width, height, null);
        } finally {
            graphic.dispose(); // release the graphics context's resources
        }

        // 4. Write the canvas to the destination file.
        // NOTE(review): the format stays "bmp" as before even when the file is named *.jpg - confirm intent.
        try (BufferedOutputStream bufferedOutputStream =
                new BufferedOutputStream(new FileOutputStream(file))) {
            return ImageIO.write(image, "bmp", bufferedOutputStream);
        }
    } catch (IOException e) {
        System.out.println(e);
    }
    return false;
}