Java爬虫,存入数据库,且写入文件存入txt;
WyouSubUrl.java ///部分代码//获取网页数据
public class WyouSubUrl implements PageProcessor { // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(100); ......... @Override public void process(Page page) { if (page.getUrl().regex("^https://jobs\\.51job\\.com").match()) { // 加入满足条件的链接,得到当前job详情页的部分信息; page.putField("jobname", page.getHtml().xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()").toString());//jobname page.putField("address",page.getHtml().xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/span/text()").toString());//address page.putField("company", page.getHtml().xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a/text()").toString());//com_name page.putField("information", page.getHtml().xpath("/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()").toString()); } else {// 列表页 eg java页 // 招聘信息url page.addTargetRequests( page.getHtml().xpath("//div[@class='el']/p/span/a/@href").all()); System.out.println(page.getHtml().xpath("//div[@class='el']/p/span/a/@href").all().size()); i++; // 翻页url /*if(i==50) return; else page.addTargetRequest("https://search.51job.com/list/090200%252C010000,000000,0000,00,9,99,java,2,"+i+".html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="); */ } public static void main(String[] args) { Spider.create(new WyouSubUrl()).addUrl(homeurl).addPipeline(new MyyPipeline()) .thread(4).run();homeurl是指主页各种职业列表的url } }
//自定义实现Pipeline接口 class MyyPipeline implements Pipeline { @Override public void process(ResultItems resultItems, Task task) { Map<String, Object> mapResults = resultItems.getAll(); Iterator<Entry<String, Object>> iter = mapResults.entrySet().iterator(); Map.Entry<String, Object> entry; // 输出到控制台 while (iter.hasNext()) { entry = iter.next(); System.out.println(entry.getKey() + ":" + entry.getValue()); } // 持久化,将数据写入文件,方便进行分析处理 if (!mapResults.get("jobname").equals("")) { News news = new News();///将数据封装成对象 news.setJobname((String)mapResults.get("jobname")); news.setAddress((String)mapResults.get("address")); news.setCompany((String)mapResults.get("company")); news.setInformation((String)mapResults.get("information"));
FileWriter fileWriter; try {///将爬取到的数据按格式存入txt文件 fileWriter = new FileWriter("java.txt",true); content="Java"+"\t"+news.getSalary()+"\t"+news.getExperience()+"\t"+news.getEducation()+"\t"+ news.getAddress() + "\t" +news.getDemand(); fileWriter.write(String.valueOf(content)+"\n"); fileWriter.flush(); fileWriter.close(); } catch (IOException e) { e.printStackTrace(); } DBUtils.WriteUserToDB(news);///将每个news对象写入数据库 } } }
|
DBUtils.java ///部分代码
public class DBUtils { static NewsDao newdao = null; static SqlSession session =null; static { try { InputStream is = Resources.getResourceAsStream("mybatisconf.xml"); SqlSessionFactory sessionFactory = new SqlSessionFactoryBuilder().build(is); session = sessionFactory.openSession(true); System.out.println(session); newdao = session.getMapper(NewsDao.class); System.out.println(newdao); } catch (IOException e) { e.printStackTrace(); } }
public static void WriteUserToDB(News news) { newdao.AddNews(news);///执行此函数,将news插入数据库 session.commit(); } } |
New.java///封装对象
/**
 * Entity holding one scraped job posting; maps to the jobs table
 * (see NewsDaoMapperxml.xml). The original snippet elided the accessors
 * with "........", so the class did not compile — they are written out here.
 */
public class News {

    private int id;             // database primary key
    private String jobname;     // job title
    private String address;     // work location
    private String company;     // company name
    private String information; // job description text

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getJobname() {
        return jobname;
    }

    public void setJobname(String jobname) {
        this.jobname = jobname;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getCompany() {
        return company;
    }

    public void setCompany(String company) {
        this.company = company;
    }

    public String getInformation() {
        return information;
    }

    public void setInformation(String information) {
        this.information = information;
    }
}
NewDao.java
public interface NewsDao { public int AddNews(News news); } |
NewsDaoMapperxml.xml///插入数据库
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> <!-- MyBatis mapper bound to com.test.dao.NewsDao: the insert id "AddNews" matches the interface method, and each #{...} placeholder is bound as a prepared-statement parameter (no SQL string concatenation). --> <mapper namespace="com.test.dao.NewsDao"> <insert id="AddNews" parameterType="com.test.entity.News"> INSERT INTO jobs (jobname, address,company,information) VALUES (#{jobname}, #{address},#{company},#{information}) </insert> </mapper>
|