Java 爬虫:采用分布式多线程爬取虎嗅网站页面信息(承接上篇博客实现的上游生产者,本篇实现下游消费者)

架构图:上下游流程分析(上游向 Redis 写入文章 id,下游从 Redis 取 id 并抓取、入库)

pom

<dependencies>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>
 
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>
 
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>
    </dependencies>
     
    <build>
      <plugins>
          <plugin>
              <artifactId>maven-assembly-plugin</artifactId>
              <configuration>
                  <archive>
                      <manifest>
                          <mainClass>com.spider.consumer.main.ConsumerMain</mainClass>
                      </manifest>
                  </archive>
                  <descriptorRefs>
                      <descriptorRef>jar-with-dependencies</descriptorRef>
                  </descriptorRefs>
              </configuration>
          </plugin>
      </plugins>
  </build>

代码如下:

package com.spider.consumer.main;


import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;


import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;




public class ConsumerMain {
    // Fixed pool of 10 worker threads, one consumer loop per thread.
    public static final ExecutorService threadPool = Executors.newFixedThreadPool(10);

    // Single shared Redis connection pool. The original code created a new
    // JedisPool on every getJedis() call, which leaks pool resources and
    // defeats the purpose of pooling.
    private static final JedisPool jedisPool;

    static {
        JedisPoolConfig config = new JedisPoolConfig();
        config.setMaxTotal(20);
        jedisPool = new JedisPool(config, "192.168.22.87", 6379);
    }

    public static void main(String[] args) {
        // Start 10 consumer threads; each loops forever popping article ids
        // from Redis, crawling the pages and persisting them.
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessPagingThreadQueue());
        }
    }

    /**
     * Borrows a Jedis connection from the shared pool.
     *
     * <p>Callers should close the returned connection (Jedis implements
     * Closeable) so it is returned to the pool. The upstream producer must
     * have pre-loaded article ids into Redis; otherwise rpop returns null
     * and the consumer has nothing to do.
     */
    public static Jedis getJedis() {
        return jedisPool.getResource();
    }
}

线程类

package com.spider.consumer.main;


import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;


public class ProcessPagingThreadQueue extends Thread {

    public static final ArticleDao articleDao = new ArticleDao();

    private static final String ARTICLE_URL_PREFIX = "https://www.huxiu.com/article/";

    /**
     * Consumer loop: pops an article id from the Redis list "aid", skips ids
     * already recorded in the set "setaid", downloads and parses the article
     * page, saves it to the database, then marks the id as processed.
     */
    @Override
    public void run() {
        while (true) {
            // Borrow one connection per iteration and always return it to the
            // pool; the original leaked a connection on every getJedis() call.
            try (Jedis jedis = ConsumerMain.getJedis()) {
                String aid = jedis.rpop("aid");
                if (aid == null) {
                    // Queue is empty — back off briefly instead of spinning.
                    Thread.sleep(1000);
                    continue;
                }
                if (jedis.sismember("setaid", aid)) {
                    continue; // already crawled, avoid duplicate work
                }
                String url = ARTICLE_URL_PREFIX + aid + ".html";
                String html = fetchHtml(url);
                Article article = parseSinglePage(html);
                if (article != null) {
                    article.setId(aid);
                    article.setUrl(url);
                    articleDao.save(article);
                }
                // Record the id so the same page is never crawled twice.
                jedis.sadd("setaid", aid);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Log instead of silently swallowing (the original empty catch
                // hid every failure); keep the worker thread alive.
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads the page at {@code url}; returns the response body on HTTP
     * 200, otherwise null.
     */
    private String fetchHtml(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.addHeader("user-agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        // Close both client and response so sockets are not leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                return EntityUtils.toString(response.getEntity());
            }
            return null;
        }
    }

    /**
     * Extracts title, author, publish time, content, like count and comment
     * count from the article HTML. Returns null when html is null or an
     * expected element is missing (selector mismatch).
     */
    private Article parseSinglePage(String html) {
        try {
            if (html != null) {
                Article article = new Article();
                Document document = Jsoup.parse(html);
                article.setTitle(document.select(".t-h1").get(0).ownText());
                article.setAuthor(document.select(".author-name").get(0).text());
                // The publish time appears under two different markups on the site.
                Elements elements = document.select("span[class=article-time pull-left]");
                if (elements.size() == 0) {
                    article.setCreateTime(document.select(".article-time").get(0).ownText());
                } else {
                    article.setCreateTime(elements.get(0).ownText());
                }
                article.setContent(document.select(".article-content-wrap").get(0).text());
                article.setZan(document.select(".num").get(0).ownText());
                article.setPl(document.select(".article-pl").get(0).ownText());
                return article;
            }
        } catch (Exception e) {
            System.out.println("解析html数据获得文章内容失败");
        }
        return null;
    }
}

实体类

package com.spider.consumer.main;


public class Article {
private String id;
private String title;
private String author;
private String createTime;
private String sc;
private String zan;
private String pl;
private String content;
private String url;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getCreateTime() {
return createTime;
}
public void setCreateTime(String createTime) {
this.createTime = createTime;
}
public String getSc() {
return sc;
}
public void setSc(String sc) {
this.sc = sc;
}
public String getZan() {
return zan;
}
public void setZan(String zan) {
this.zan = zan;
}
public String getPl() {
return pl;
}
public void setPl(String pl) {
this.pl = pl;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
@Override
public String toString() {
return "Article [id=" + id + ", title=" + title + ", author=" + author + ", createTime=" + createTime + ", sc="
+ sc + ", zan=" + zan + ", pl=" + pl + ", content=" + content + ", url=" + url + "]";
}

}

数据库连接

package com.spider.consumer.main;


import java.beans.PropertyVetoException;

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;


/**
 * DAO for the huxiu_article table, backed by a C3P0 connection pool.
 * Extends JdbcTemplate so save() can call update() directly.
 */
public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        try {
            // Explicitly register the driver: the original code never set it
            // (its own comment listed "driver" as a step but skipped it), and
            // the 5.1.x MySQL driver is not reliably auto-discovered by C3P0.
            dataSource.setDriverClass("com.mysql.jdbc.Driver");
        } catch (PropertyVetoException e) {
            throw new IllegalStateException("Failed to set MySQL JDBC driver class", e);
        }
        // NOTE(review): credentials are hard-coded — move to external config.
        dataSource.setUser("root");
        dataSource.setPassword("123");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    /**
     * Inserts one article row. Uses a parameterized statement, so field
     * values are never concatenated into the SQL (no injection risk).
     */
    public void save(Article article) {
        String sql = "INSERT INTO huxiu_article (id, title, author, createTime, zan, pl, sc, content, url ) VALUES( ?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getZan(), article.getPl(), article.getSc(), article.getContent(), article.getUrl());
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值