本篇文章主要介绍如何用Java爬取我主良缘网站上女孩的信息:用URL访问网站,用jxl把爬取到的信息存储到Excel表中(jxl的用法请参考另一篇文章《java 中JXL操作Excel实例详解》);爬取时获取到的是JSON格式的信息,JSON的解析及应用请参考另一篇文章(原文此处链接缺失)。
爬取程序如下:
package com.lzj.spider;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Crawler entry point.
 *
 * <p>Pages through the lovewzly.com search API, sorts every returned
 * {@link Person} into one of four birth-year groups, appends the record to the
 * Excel workbook of that group and downloads the avatar picture into the
 * picture directory of that group.
 */
public class App
{
    /** One workbook per birth-year group; filled by {@link #createExcel()}. */
    static WritableWorkbook[] books = new WritableWorkbook[4];
    /** First sheet of the workbook with the same index in {@link #books}. */
    static WritableSheet[] sheets = new WritableSheet[4];

    /** Output workbook file per group, indexed like {@link #books}. */
    private static final String[] WORKBOOK_FILES = {
        "E:/MyData/20岁以下少女.xls",
        "E:/MyData/20-30岁青年女孩.xls",
        "E:/MyData/30-40岁中年女人.xls",
        "E:/MyData/50岁以上中老年妇女.xls"
    };

    /** Avatar download directory per group, indexed like {@link #books}. */
    private static final String[] PICTURE_DIRS = {
        "E:/MyData/picture/小于20岁",
        "E:/MyData/picture/20-30岁",
        "E:/MyData/picture/30-40岁",
        "E:/MyData/picture/大于50岁"
    };

    /** Header row written to row 0 of every sheet, in column order. */
    private static final String[] HEADERS = {
        "userid", "username", "gender", "education", "height",
        "birthdayyear", "province", "city", "monolog"
    };

    /**
     * Runs the crawl: creates the workbooks, fetches result pages until an
     * empty page is returned, and finally writes and closes every workbook
     * that was created, even when the crawl aborted with an exception.
     *
     * @param args unused
     */
    public static void main( String[] args )
    {
        /* search condition */
        int page = 0;
        String gender = "2"; /* female */
        /* next free data row per sheet; row 0 is taken by the header */
        int[] nextRow = {1, 1, 1, 1};
        try {
            createExcel();
            while (true) {
                List<Person> persons = fetchPage(page, gender);
                if (persons == null || persons.isEmpty()) {
                    System.out.println("over");
                    break;
                }
                /* store the page's records in the workbook of their group */
                for (Person person : persons) {
                    if (person == null) {
                        continue;
                    }
                    String birthYear = person.getBirthdayyear();
                    if (birthYear == null || birthYear.trim().isEmpty()) {
                        /* skip only this record, not the rest of the page */
                        continue;
                    }
                    int group = ageGroup(birthYear);
                    MyService.writeExcel(books[group], sheets[group], nextRow[group], person);
                    MyService.getPicture(person, PICTURE_DIRS[group]);
                    nextRow[group]++;
                }
                ++page;
            }
        } catch (Exception e) {
            System.out.println("over");
            e.printStackTrace();
        } finally {
            System.out.println("hello finally");
            /*
             * Flush each workbook on its own: a workbook that was never
             * created (null) or that fails to write must not prevent the
             * remaining ones from being saved and closed.
             */
            for (WritableWorkbook book : books) {
                if (book == null) {
                    continue;
                }
                try {
                    book.write();
                    book.close();
                } catch (Exception e2) {
                    e2.printStackTrace();
                }
            }
        }
    }

    /**
     * Downloads and parses one page of search results.
     *
     * <p>The HTTP stream is closed before returning, so no connection stays
     * open across loop iterations.
     *
     * @param page   zero-based page number sent as the {@code page} parameter
     * @param gender value sent as the {@code gender} parameter
     * @return whatever {@link MyService#execute} returns for the response
     * @throws IOException if the URL cannot be opened or read
     */
    private static List<Person> fetchPage(int page, String gender) throws IOException {
        String condition = "page=" + page + "&gender=" + gender;
        URL url = new URL("http://www.lovewzly.com/api/user/pc/list/search?" + condition);
        InputStream inputStream = url.openStream();
        try {
            BufferedReader bufferedReader =
                    new BufferedReader(new InputStreamReader(inputStream, "utf-8"));
            return MyService.execute(bufferedReader);
        } finally {
            /* closing the underlying stream releases the wrapping readers too */
            inputStream.close();
        }
    }

    /**
     * Maps a birth year to the index of its group in {@link #books}.
     *
     * <p>The thresholds are hard-coded years and are compared as strings,
     * which orders four-digit years correctly. Every lower bound is inclusive:
     * {@code >= 1998} is group 0, {@code 1988..1997} group 1,
     * {@code 1978..1987} group 2, everything else group 3.
     *
     * @param birthYear non-null birth year text
     * @return group index in the range 0..3
     */
    private static int ageGroup(String birthYear) {
        String year = birthYear.trim();
        if (year.compareTo("1998") >= 0) {
            return 0;
        }
        if (year.compareTo("1988") >= 0) {
            return 1;
        }
        if (year.compareTo("1978") >= 0) {
            return 2;
        }
        return 3;
    }

    /**
     * Creates the four workbooks, gives each a first sheet named "第一页" and
     * writes the header row into row 0 of every sheet.
     *
     * <p>On failure the workbooks created so far stay in {@link #books} so the
     * caller can still close them.
     *
     * @throws IOException           if a workbook file cannot be created
     * @throws RowsExceededException if a header cell cannot be added
     * @throws WriteException        if a header cell cannot be added
     */
    public static void createExcel() throws IOException, RowsExceededException, WriteException{
        for (int i = 0; i < WORKBOOK_FILES.length; i++) {
            books[i] = Workbook.createWorkbook(new File(WORKBOOK_FILES[i]));
            sheets[i] = books[i].createSheet("第一页", 0);
            for (int col = 0; col < HEADERS.length; col++) {
                sheets[i].addCell(new Label(col, 0, HEADERS[col]));
            }
        }
    }
}
package com.lzj.spider;
/**
 * One user record from the search API.
 *
 * <p>Instances are created by Gson in {@code MyService.execute} via
 * {@code gson.fromJson(element, Person.class)}. NOTE(review): Gson presumably
 * binds JSON keys to these field names, so renaming a field would likely break
 * deserialization — confirm before refactoring.
 */
public class Person {
private String userid;
private String username;
private String gender; /* 1: male, 2: female */
private String education;
private String height;
/* Birth year as text; App compares it as a string against fixed years. */
private String birthdayyear;
private String province;
private String city;
private String monolog;
private String avatar; /* picture; MyService.getPicture uses it as the image URL */
/* getters, setters and toString() are omitted here */
}
package com.lzj.spider;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import javax.imageio.stream.FileCacheImageInputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Helper routines for the crawler: JSON parsing of a search response, Excel
 * output through jxl, and avatar download.
 */
public class MyService {

    /** Output workbook file per birth-year group. */
    private static final String[] WORKBOOK_FILES = {
        "E:/MyData/20岁以下少女.xls",
        "E:/MyData/20-30岁青年女孩.xls",
        "E:/MyData/30-40岁中年女人.xls",
        "E:/MyData/50岁以上中老年妇女.xls"
    };

    /** Header row written to row 0 of every sheet, in column order. */
    private static final String[] HEADERS = {
        "userid", "username", "gender", "education", "height",
        "birthdayyear", "province", "city", "monolog"
    };

    /**
     * Parses one search response into a list of persons.
     *
     * <p>Reads {@code data.num} and {@code data.list} from the JSON document.
     * A response without a usable {@code data} object or {@code list} array is
     * treated like an empty result instead of failing with a
     * NullPointerException or ClassCastException.
     *
     * @param reader source of the JSON document
     * @return the parsed persons, or {@code null} when {@code data.num} is 0
     *         or the expected structure is missing
     */
    public static List<Person> execute(Reader reader){
        JsonElement root = new JsonParser().parse(reader);
        if (root == null || !root.isJsonObject()) {
            return null;
        }
        JsonElement data = root.getAsJsonObject().get("data");
        if (data == null || !data.isJsonObject()) {
            return null;
        }
        JsonObject dataObject = data.getAsJsonObject();
        JsonElement num = dataObject.get("num");
        if (num != null && num.isJsonPrimitive() && num.getAsInt() == 0) {
            return null;
        }
        JsonElement list = dataObject.get("list");
        if (list == null || !list.isJsonArray()) {
            return null;
        }
        Gson gson = new Gson();
        List<Person> persons = new ArrayList<Person>();
        for (JsonElement element : list.getAsJsonArray()) {
            Person person = gson.fromJson(element, Person.class);
            /* a JSON null entry deserializes to null; keep it out of the list */
            if (person != null) {
                persons.add(person);
            }
        }
        return persons;
    }

    /**
     * Creates the four workbooks, gives each a first sheet named "第一页" and
     * writes the header row into row 0 of every sheet.
     *
     * @return a map holding the {@code WritableWorkbook[]} under key
     *         {@code "book"} and the {@code WritableSheet[]} under key
     *         {@code "sheet"}
     * @throws IOException           if a workbook file cannot be created
     * @throws RowsExceededException if a header cell cannot be added
     * @throws WriteException        if a header cell cannot be added
     */
    public static Map<String, Object> createExcel() throws IOException, RowsExceededException, WriteException{
        WritableWorkbook[] book = new WritableWorkbook[WORKBOOK_FILES.length];
        WritableSheet[] sheet = new WritableSheet[WORKBOOK_FILES.length];
        for (int i = 0; i < WORKBOOK_FILES.length; i++) {
            book[i] = Workbook.createWorkbook(new File(WORKBOOK_FILES[i]));
            sheet[i] = book[i].createSheet("第一页", 0);
            for (int col = 0; col < HEADERS.length; col++) {
                sheet[i].addCell(new Label(col, 0, HEADERS[col]));
            }
        }
        Map<String, Object> excels = new HashMap<String, Object>();
        excels.put("book", book);
        excels.put("sheet", sheet);
        return excels;
    }

    /**
     * Writes one person into the given row of a sheet, one field per column,
     * in the same column order as the header row.
     *
     * @param book   workbook owning the sheet; not used by this method
     * @param sheet  sheet to write into
     * @param row    zero-based row index
     * @param person record to write
     * @throws RowsExceededException if a cell cannot be added
     * @throws WriteException        if a cell cannot be added
     * @throws IOException           declared for callers; not thrown here
     */
    public static void writeExcel(WritableWorkbook book, WritableSheet sheet, Integer row, Person person) throws RowsExceededException, WriteException, IOException{
        System.out.println("&&&&: " + row);
        System.out.println("person : " + person);
        String[] values = {
            person.getUserid(),
            person.getUsername(),
            person.getGender(),
            person.getEducation(),
            person.getHeight(),
            person.getBirthdayyear(),
            person.getProvince(),
            person.getCity(),
            person.getMonolog()
        };
        for (int col = 0; col < values.length; col++) {
            sheet.addCell(new Label(col, row, values[col]));
        }
    }

    /**
     * Builds the target file path {@code path/userid-username-birthyear.jpg}.
     *
     * <p>The name parts come from the remote API, so path separators and
     * characters that are illegal in file names are replaced with
     * {@code '_'}; otherwise a crafted user name could escape {@code path}
     * or make the write fail.
     */
    private static String pictureFile(Person person, String path) {
        String name = person.getUserid() + "-" + person.getUsername() + "-" + person.getBirthdayyear();
        return path + "/" + name.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_") + ".jpg";
    }

    /**
     * First approach: decodes the avatar with ImageIO and re-encodes it as
     * JPEG. Does nothing when the person has no avatar URL. Failures are
     * reported on stderr and never propagated.
     *
     * @param person person whose avatar is downloaded
     * @param path   existing directory that receives the picture
     */
    public static void getPicture(Person person, String path){
        String urlString = person.getAvatar();
        if (urlString == null || urlString.trim().isEmpty()) {
            return;
        }
        String file = pictureFile(person, path);
        try {
            BufferedImage img = ImageIO.read(new URL(urlString));
            if (img == null) {
                /* ImageIO.read yields null when no reader understands the data */
                System.err.println("unsupported image format: " + urlString);
                return;
            }
            if (!ImageIO.write(img, "jpg", new File(file))) {
                System.err.println("no jpg writer could store: " + file);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Second approach: copies the avatar bytes unchanged into the target
     * file. Does nothing when the person has no avatar URL. Failures are
     * reported on stderr and never propagated; every resource that was opened
     * is released.
     *
     * @param person person whose avatar is downloaded
     * @param path   existing directory that receives the picture
     */
    public static void getPicture2(Person person, String path){
        String urlString = person.getAvatar();
        if (urlString == null || urlString.trim().isEmpty()) {
            return;
        }
        FileOutputStream out = null;
        BufferedInputStream in = null;
        HttpURLConnection connection = null;
        byte[] buf = new byte[1024];
        try {
            connection = (HttpURLConnection) new URL(urlString).openConnection();
            connection.connect();
            in = new BufferedInputStream(connection.getInputStream());
            out = new FileOutputStream(pictureFile(person, path));
            int len;
            while ((len = in.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            /* close each resource independently; any of them may still be null */
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}