一次临时需求,需要收集一批 fb 用户的 fb_id。这些信息必须登录后才能采集,但又不想每次运行都重新登录:一是容易被封号,二是降低效率。解决办法是让 webdriver 复用同一个 user-data-dir 来保留登录状态;不过这种用法需要确保操作环境中只存在由 webdriver 操作的 chrome 进程。操作步骤如下。
1.首先自定义user-data-dir初始化webDriver
//设置启动chrome为默认用户的配置信息(包括书签、扩展程序、代理设置等), 运行程序前需关闭win7系统中采用默认配置打开的chrome浏览器
options.addArguments("user-data-dir="+dataDir);
2.进入登录界面,如果没有验证可以自动输入;如果有验证,则手动登录。
// One-off login bootstrap: start a driver bound to the custom user-data-dir, open a page, and keep the
// browser open long enough for a manual login before closing it.
// NOTE(review): 2000*1000 is presumably milliseconds (about 33 minutes) — confirm the unit of ThreadUtils.sleep.
// NOTE(review): ip/port are passed straight to SeleniumUtils.initDriver; presumably proxy settings — confirm.
String dir="D:\\test\\fb"; String ip="x.x.6.1"; String port="1984"; String url="https://www.facebook.com/horace.ckk/"; WebDriver webDriver = SeleniumUtils.initDriver(ip, port, null, null, dir); webDriver.get(url); ThreadUtils.sleep(2000*1000); webDriver.close();
接下来只要保留(复用)同一个 user-data-dir,就可以保持登录状态直接访问用户主页了。
package facebook;
import com.isi.utils.*;
import com.jfinal.plugin.activerecord.DbPro;
import com.jfinal.plugin.activerecord.Record;
import org.apache.log4j.Logger;
import org.openqa.selenium.WebDriver;
import java.util.List;
/**
 * Downloads the pages listed in {@code ./conf/fb_home.txt} with a WebDriver that reuses a fixed
 * user-data-dir, extracts the {@code container_id} value from each page source and stores the
 * page together with that id in the {@code fb_page_html} table.
 *
 * <p>URLs whose MD5 is already present in the table are skipped, so the job can be re-run safely.
 */
public class HomeSpider {
    private static final Logger logger = Logger.getLogger(HomeSpider.class);

    /** Table that holds one row per downloaded page. */
    private static final String TABLE = "fb_page_html";

    /** File with one page URL per line. */
    private static final String URL_FILE = "./conf/fb_home.txt";

    /** Chrome user-data-dir shared between runs. */
    private static final String USER_DATA_DIR = "D:\\test\\fb";

    // NOTE(review): these are handed to SeleniumUtils.initDriver as-is; presumably proxy host/port — confirm.
    private static final String DRIVER_IP = "x.x.6.1";
    private static final String DRIVER_PORT = "1984";

    /** Pattern whose first group captures the numeric id embedded in the page source. */
    private static final String fbId_reg = "\"container_id\":\"(\\d+)\"";

    /**
     * Reads the URL list from the given file.
     *
     * @param path file to read, decoded as UTF-8
     * @return whatever {@code MyIOUtils.readline} returns for that file
     */
    private List<String> getUrls(String path) {
        return MyIOUtils.readline(path, "utf-8");
    }

    /**
     * Creates the WebDriver bound to {@link #USER_DATA_DIR}.
     *
     * @return a driver the caller is responsible for shutting down
     */
    private WebDriver getDriver() {
        return SeleniumUtils.initDriver(DRIVER_IP, DRIVER_PORT, null, null, USER_DATA_DIR);
    }

    /**
     * Navigates to {@code url}, waits, and returns the current page source.
     *
     * @param url       page to open
     * @param webDriver driver used for navigation
     * @return the page source reported by the driver
     */
    private String download(String url, WebDriver webDriver) {
        webDriver.get(url);
        // Fixed pause before reading the source; presumably 5 seconds — confirm the unit of ThreadUtils.sleep.
        ThreadUtils.sleep(5 * 1000);
        return webDriver.getPageSource();
    }

    /**
     * Inserts one row for the downloaded page.
     *
     * <p>The caller must already have checked {@link #exist(String)}; repeating the lookup here
     * would only double the number of queries per URL.
     *
     * @param pageSource page source to store (emoji-filtered before saving)
     * @param url        URL the page was downloaded from
     */
    private void save(String pageSource, String url) {
        String fbId = getFbId(pageSource);
        if (fbId == null || fbId.isEmpty()) {
            // The row is still stored so the page can be inspected later.
            logger.warn("fb_id not found in page source, url=====" + url);
        }

        Record record = new Record();
        record.set("url", url);
        record.set("url_md5", StrUtils.md5(url));
        record.set("content", StrUtils.filterEmoji(pageSource));
        record.set("fb_id", fbId);

        DbPro db = JfinalDbUtil.getDb(JfinalDbUtil.source_name);
        db.save(TABLE, record);
        logger.info("save db url=====" + url);
    }

    /**
     * Tells whether a row with the MD5 of {@code url} is already stored.
     *
     * @param url URL to look up
     * @return {@code true} when a matching row exists
     */
    private boolean exist(String url) {
        DbPro db = JfinalDbUtil.getDb(JfinalDbUtil.source_name);
        Record existRecord = db.findFirst("select 1 from " + TABLE + " where url_md5=?", StrUtils.md5(url));
        return existRecord != null;
    }

    /**
     * Extracts the id from the page source using {@link #fbId_reg}.
     *
     * @param content page source
     * @return the result of {@code StrUtils.regexpExtract} for the pattern
     */
    private String getFbId(String content) {
        return StrUtils.regexpExtract(content, fbId_reg);
    }

    /**
     * Processes every URL from {@link #URL_FILE}: skip known URLs, download the rest and store them.
     * A failure on one URL is logged and does not stop the remaining URLs.
     */
    private void exe() {
        List<String> urls = getUrls(URL_FILE);
        if (urls == null || urls.isEmpty()) {
            // Nothing to do, so do not start a browser at all.
            logger.warn("no urls found in " + URL_FILE);
            return;
        }

        WebDriver driver = getDriver();
        try {
            for (String url : urls) {
                if (url == null || url.trim().isEmpty()) {
                    // Blank lines in the URL file are not pages.
                    continue;
                }
                try {
                    if (exist(url)) {
                        logger.info("already exist no need process url===" + url);
                        continue;
                    }
                    String pageSource = download(url, driver);
                    if (pageSource == null || pageSource.isEmpty()) {
                        logger.warn("empty page source, skip url===" + url);
                        continue;
                    }
                    save(pageSource, url);
                } catch (Exception e) {
                    // Boundary catch: keep going with the next URL, but record which one failed and why.
                    logger.error("failed to process url===" + url, e);
                }
            }
        } finally {
            // quit() ends the whole browser session even on failure, so no browser process is left
            // holding the user-data-dir; close() would only close the current window.
            driver.quit();
        }
    }

    public static void main(String[] args) {
        HomeSpider homeSpider = new HomeSpider();
        homeSpider.exe();
    }
}