简单的爬取网站数据demo
思路
- 如果网站提供了获取数据的 API,可以优先通过 API 来获取数据。
- 如果网站没有 API,或者 API 封装得比较谨慎,则需要获取页面源码并解析 HTML 来获取数据。
引入依赖
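该依赖(jsoup)主要用于请求网页,并将 HTML 源码解析为可按 id、class 等查询的 Document 对象。注意:下面的 demo 还使用了 Apache POI(写入 Excel)和 JUnit(@Test),需要另外引入对应的依赖。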
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<!-- jsoup: HTML parsing library; the demo uses it to fetch a page and look up elements by id/class. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
获取html源码并解析demo
package jpgk.text.demo;
import org.apache.poi.hssf.usermodel.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.FileOutputStream;
import java.util.ArrayList;
import java.util.List;
/**
* 抓取网页数据
*/
/**
 * Demo that scrapes unit prices from paginated listing pages with jsoup and writes them,
 * one per row, into the first column of an Excel (.xls) sheet using Apache POI.
 */
public class GetHtmlPage {

    /** Pages are requested for page numbers 1 (inclusive) up to this value (exclusive). */
    private static final int PAGE_LIMIT = 100;

    /** Base address of the listing pages (host is masked in this demo). */
    private static final String BASE_URL = "https://shanghai.*****.com/sale/";

    /** Query string and fragment appended to every page URL. */
    private static final String URL_SUFFIX = "rd1/?from=zjsr&kw=%E8%8A%B1%E6%A1%A5";
    private static final String URL_FRAGMENT = "#filtersort";

    /** Id of the element that wraps the listing, and class of the price elements inside it. */
    private static final String LIST_CONTAINER_ID = "houselist-mod-new";
    private static final String UNIT_PRICE_CLASS = "unit-price";

    /** Unit suffix removed from each scraped price text. */
    private static final String PRICE_UNIT = "元/m²";

    /** Sheet name and destination file of the generated workbook. */
    private static final String SHEET_NAME = "花桥房价";
    // NOTE(review): the file name differs from the sheet name by one character ("放");
    // possibly a typo, kept unchanged so the output location stays the same.
    private static final String OUTPUT_PATH = "D:\\花桥放房价.xls";

    /**
     * Scrapes every page in turn, prints the collected prices and writes them to
     * {@link #OUTPUT_PATH}.
     *
     * <p>A page that fails to download or parse is reported on {@code System.err} and skipped;
     * the remaining pages are still processed.
     *
     * @throws Exception if the workbook cannot be written to disk
     */
    @Test
    public void test1() throws Exception {
        // Create the Excel workbook and the sheet that receives the prices.
        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet(SHEET_NAME);

        List<String> stringList = new ArrayList<>();
        for (int p = 1; p < PAGE_LIMIT; p++) {
            System.err.println("开始抓取第" + p + "页数据");
            try {
                stringList.addAll(fetchUnitPrices(buildPageUrl(p)));
            } catch (Exception e) {
                // Deliberately broad: this is a best-effort crawl, so a single bad page
                // must not abort the whole run.
                System.err.println("第" + p + "页抓取失败");
                e.printStackTrace();
            }
        }

        System.err.println("花桥房屋单价:" + stringList);
        System.err.println("花桥房屋单价数据条数:" + stringList.size());

        // One price per row, always in column 0.
        for (int i = 0; i < stringList.size(); i++) {
            sheet.createRow(i).createCell(0).setCellValue(stringList.get(i));
        }

        // try-with-resources closes the stream even when write() throws.
        try (FileOutputStream fileOutputStream = new FileOutputStream(OUTPUT_PATH)) {
            workbook.write(fileOutputStream);
        }
    }

    /**
     * Builds the request URL for one listing page.
     *
     * @param page 1-based page number; page 1 has no {@code pN-} path segment
     * @return the full URL including query string and fragment
     */
    private static String buildPageUrl(int page) {
        StringBuilder requestUrl = new StringBuilder(BASE_URL);
        if (page > 1) {
            requestUrl.append("p").append(page).append("-");
        }
        return requestUrl.append(URL_SUFFIX).append(URL_FRAGMENT).toString();
    }

    /**
     * Downloads one listing page and extracts the text of every unit-price element,
     * with the {@link #PRICE_UNIT} suffix removed.
     *
     * @param url address of the listing page
     * @return the prices found on the page; empty if the listing container is missing
     * @throws java.io.IOException if the page cannot be fetched
     */
    private List<String> fetchUnitPrices(String url) throws java.io.IOException {
        // Note: this is jsoup's Connection, not a JDBC/network one.
        Connection connect = Jsoup.connect(url);
        Document document = connect.get();

        List<String> prices = new ArrayList<>();
        Element container = document.getElementById(LIST_CONTAINER_ID);
        if (container == null) {
            // getElementById returns null when no element has that id.
            System.err.println("未找到房源列表节点: " + url);
            return prices;
        }
        for (Element priceElement : container.getElementsByClass(UNIT_PRICE_CLASS)) {
            prices.add(priceElement.text().replace(PRICE_UNIT, ""));
        }
        return prices;
    }

    /**
     * Creates a centred cell style with an 18pt Arial font, intended for a heading cell.
     * Not currently applied to any cell by {@link #test1()}.
     *
     * @param workbook workbook that will own the style and its font
     * @return the newly created style
     */
    private HSSFCellStyle getHeadCellStyle(HSSFWorkbook workbook) {
        return createCenteredStyle(workbook, (short) 18);
    }

    /**
     * Creates a centred cell style with an 11pt Arial font, intended for a title cell.
     * Not currently applied to any cell by {@link #test1()}.
     *
     * @param workbook workbook that will own the style and its font
     * @return the newly created style
     */
    private HSSFCellStyle getTitleCellStyle(HSSFWorkbook workbook) {
        return createCenteredStyle(workbook, (short) 11);
    }

    /**
     * Shared implementation of the two style factories: Arial font of the given size using
     * palette colour index 56, centred both vertically and horizontally.
     *
     * @param workbook workbook that will own the style and its font
     * @param fontHeightInPoints font size in points
     * @return the newly created style
     */
    private HSSFCellStyle createCenteredStyle(HSSFWorkbook workbook, short fontHeightInPoints) {
        HSSFFont font = workbook.createFont();
        font.setFontName("Arial");
        font.setFontHeightInPoints(fontHeightInPoints);
        font.setColor((short) 56);

        HSSFCellStyle style = workbook.createCellStyle();
        // Vertical alignment.
        style.setVerticalAlignment(HSSFCellStyle.VERTICAL_CENTER);
        // Horizontal alignment.
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
        style.setFont(font);
        return style;
    }
}