java爬虫

用 Java 爬虫抓取招聘信息,将结果存入数据库,同时写入 txt 文件。

WyouSubUrl.java    ///部分代码//获取网页数据

public class WyouSubUrl implements PageProcessor {

// 抓取网站的相关配置,包括编码、抓取间隔、重试次数等

private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

.........

@Override

public void process(Page page) {

if (page.getUrl().regex("^https://jobs\\.51job\\.com").match()) {

// 加入满足条件的链接,得到当前job详情页的部分信息;

page.putField("jobname", page.getHtml().xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()").toString());//jobname

page.putField("address",page.getHtml().xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/span/text()").toString());//address

page.putField("company", page.getHtml().xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a/text()").toString());//com_name

page.putField("information", page.getHtml().xpath("/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()").toString());

}

else {// 列表页 eg java页 // 招聘信息url

page.addTargetRequests(

page.getHtml().xpath("//div[@class='el']/p/span/a/@href").all());

System.out.println(page.getHtml().xpath("//div[@class='el']/p/span/a/@href").all().size());

i++;

// 翻页url

/*if(i==50)

return;

else

page.addTargetRequest("https://search.51job.com/list/090200%252C010000,000000,0000,00,9,99,java,2,"+i+".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=");

            */

}

    public static void main(String[] args) {

Spider.create(new WyouSubUrl()).addUrl(homeurl).addPipeline(new MyyPipeline())

.thread(4).run();homeurl是指主页各种职业列表的url

}

}

 

//自定义实现Pipeline接口

class MyyPipeline implements Pipeline {

@Override

public void process(ResultItems resultItems, Task task) {

Map<String, Object> mapResults = resultItems.getAll();

Iterator<Entry<String, Object>> iter = mapResults.entrySet().iterator();

Map.Entry<String, Object> entry;

// 输出到控制台

while (iter.hasNext()) {

entry = iter.next();

System.out.println(entry.getKey() + ":" + entry.getValue());

}

// 持久化,将数据写入文件,方便进行分析处理

if (!mapResults.get("jobname").equals("")) {

News news = new News();///将数据封装成对象

news.setJobname((String)mapResults.get("jobname"));

news.setAddress((String)mapResults.get("address"));

news.setCompany((String)mapResults.get("company"));

news.setInformation((String)mapResults.get("information"));

 

FileWriter fileWriter;

try {///将爬取到的数据按格式存入txt文件

 fileWriter = new FileWriter("java.txt",true);

content="Java"+"\t"+news.getSalary()+"\t"+news.getExperience()+"\t"+news.getEducation()+"\t"+ news.getAddress() +  "\t" +news.getDemand();

     fileWriter.write(String.valueOf(content)+"\n");

     fileWriter.flush();

     fileWriter.close();

} catch (IOException e) {

e.printStackTrace();

}

DBUtils.WriteUserToDB(news);///将每个news对象写入数据库

}

}

}

 

 

DBUtils.java ///部分代码

public class DBUtils {

static NewsDao newdao = null;

static SqlSession session =null;

static {

try {

InputStream is = Resources.getResourceAsStream("mybatisconf.xml");

SqlSessionFactory sessionFactory = new SqlSessionFactoryBuilder().build(is);

session = sessionFactory.openSession(true);

System.out.println(session);

newdao = session.getMapper(NewsDao.class);

System.out.println(newdao);

} catch (IOException e) {

e.printStackTrace();

}

}

 

public static void WriteUserToDB(News news) {

newdao.AddNews(news);///执行此函数,将news插入数据库

session.commit();

}

}

 

News.java ///封装对象

/**
 * Entity holding the fields extracted for one job posting.
 */
public class News {

    private int id;
    private String jobname;
    private String address;
    private String company;
    private String information;

    /** @return the record id */
    public int getId() {
        return id;
    }

    /** @param id the record id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the job name, or null if not set */
    public String getJobname() {
        return jobname;
    }

    /** @param jobname the job name */
    public void setJobname(String jobname) {
        this.jobname = jobname;
    }

    /** @return the address, or null if not set */
    public String getAddress() {
        return address;
    }

    /** @param address the address */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return the company name, or null if not set */
    public String getCompany() {
        return company;
    }

    /** @param company the company name */
    public void setCompany(String company) {
        this.company = company;
    }

    /** @return the job description text, or null if not set */
    public String getInformation() {
        return information;
    }

    /** @param information the job description text */
    public void setInformation(String information) {
        this.information = information;
    }

}

 

NewsDao.java

public interface NewsDao {

   public int AddNews(News news);

}

 

NewsDaoMapperxml.xml///插入数据库

<?xml version="1.0" encoding="UTF-8"?>

<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">

<mapper namespace="com.test.dao.NewsDao">

<!-- AddNews: inserts the jobname, address, company and information properties of a
     com.test.entity.News parameter into the jobs table; no id value is supplied. -->
<insert id="AddNews" parameterType="com.test.entity.News">

    INSERT INTO jobs (jobname, address,company,information)

VALUES (#{jobname}, #{address},#{company},#{information})

</insert>

</mapper>

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值