实体店选址大数据分析-保定地区-POI数据-powermap-爬虫获取真实数据
讲一下项目的思路
预期通过获取POI数据并进行分析,为实体店选址决策提供依据
数据源
58同城店铺转让数据
安居客小区详细信息数据
安居客小区平均房价数据
上爬虫代码
package wubaSpider;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import wubaSpider.pojo.AnjukePojo;
import Util.AnjukeDataUtil;
import Util.AnjukeDataUtil2;
import bossSpider.Blibli_Selemium;
// Crawls Anjuke (安居客) community detail pages for Baoding with WebMagic,
// using a Selenium-backed downloader; extracted data is appended to local files.
public class Anjuke_spider3 implements PageProcessor{
// Target URLs reloaded from previously saved data files (see process()).
List<String> list=new ArrayList<String>();
// Extracted page snippets (h1 title + "basic-infos-box"), flushed to disk each cycle.
List<String> list2=new ArrayList<String>();
// Number of pages processed so far; incremented once per process() call.
int z=0;
// Retry once on failure; sleep 20 s between requests to reduce the chance of being blocked.
private Site site = Site.me().setRetryTimes(1).setSleepTime(20000);
// Returns the crawl configuration required by the PageProcessor contract.
public Site getSite() {
return site;
}
/**
 * Processes one downloaded community page: extracts the h1 title and the
 * "basic-infos-box" element, appends the combined record to the in-memory
 * result list, flushes the de-duplicated results to disk, then enqueues
 * further target URLs collected from previously saved data files.
 *
 * @param page the downloaded page supplied by the WebMagic framework
 */
public void process(Page page) {
    // "name" is never put into the result items, so the pipeline is always
    // skipped; results are persisted manually via IOwriteData below.
    page.putField("author", page.getUrl().toString());
    if (page.getResultItems().get("name") == null) {
        page.setSkip(true);
    }
    z++; // crawl counter (1-based)
    // Selectable.toString() may return null when no <h1> matched.
    String a = page.getHtml().css("h1").toString();
    // Elements.toString() never returns null (empty string when no match).
    String a0 = page.getHtml().getDocument().getElementsByClass("basic-infos-box").toString();
    // BUGFIX: the original condition (a != null || a0 != null) was always true
    // because a0 is never null, and a null title was then concatenated into the
    // record as the literal "null". Guard each part explicitly instead.
    if (a != null || !a0.isEmpty()) {
        String str = (a == null ? "" : a) + a0 + "/001";
        list2.add(str);
        System.out.println(str);
    } else {
        list2.add(list.get(z - 1) + "/001");
    }
    // BUGFIX: directory and file name were concatenated without a path
    // separator ("G:五八爬虫数据list10Demo223333.txt"); add the trailing separator.
    IOwriteData("G:\\五八爬虫数据\\", "list10Demo223333.txt", "utf-8", removeDuplicate(list2));
    System.out.println("已经进行第" + z + "次爬取");
    // Reload the full URL set from previously saved files and enqueue them.
    HashSet<String> set = new HashSet<String>();
    String path = "G:\\五八爬虫数据\\房地产\\demo";
    try {
        AnjukeDataUtil.getCollectionsData(path, set);
        list.addAll(set);
    } catch (Exception e) {
        e.printStackTrace();
    }
    page.addTargetRequests(removeDuplicate(list));
}
/**
 * Entry point: configures a Selenium-backed downloader and starts the crawl
 * from a single Anjuke community detail page for Baoding. Blocks until the
 * spider finishes.
 *
 * @param args unused
 * @throws IOException declared for compatibility; not thrown directly here
 */
public static void main(String[] args) throws IOException {
    // Create the spider / page-processor instance.
    Anjuke_spider3 spider = new Anjuke_spider3();
    // Selenium downloader driven by a local chromedriver binary.
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader("G:\\爬虫\\drive\\chromedriver.exe");
    // WebMagic's Selenium configuration file (browser options, etc.).
    System.setProperty("selenuim_config", "G:\\workspace\\Git\\webmagic\\config.ini");
    // Seed URL: one community page; further URLs are enqueued in process().
    String seedUrl = "https://baoding.anjuke.com/community/view/1019536";
    // Removed an unused `new Request()` local; the comment previously claiming
    // "3 threads" was wrong — a single crawl thread is used.
    Spider.create(spider)
            .addUrl(seedUrl)
            .setDownloader(seleniumDownloader)
            .thread(1)  // single crawl thread
            .run();     // blocks the current thread until the crawl completes
    // seleniumDownloader.close();
}
/**
 * Appends every string in {@code list} to the file {@code dataPath + fileName}
 * using the given character set. The two path parts are concatenated as-is,
 * so {@code dataPath} should end with a file separator. Entries are written
 * back-to-back with no separator between them.
 *
 * @param dataPath directory prefix (should end with '\\' or '/')
 * @param fileName target file name, appended directly to {@code dataPath}
 * @param charset  character encoding for the output, e.g. "utf-8"
 * @param list     strings to append to the file
 */
private static void IOwriteData(String dataPath, String fileName, String charset, List<String> list) {
    // BUGFIX: the original finally-block called writer.close()/fos.close()
    // unconditionally and threw NullPointerException whenever the stream or
    // writer constructor failed; try-with-resources closes both safely.
    try (FileOutputStream fos = new FileOutputStream(dataPath + fileName, true);
         OutputStreamWriter writer = new OutputStreamWriter(fos, charset)) {
        for (String entry : list) {
            writer.append(entry);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
public static List