php采集新闻数据插入数据库,PHP采集36kr(www.36kr.com)的文章,并存入数据库。

[PHP]代码

<?php

/**
 * Crawls a numeric range of 36kr article pages and optionally stores them.
 *
 * For every id in [$start, $end] the page http://www.36kr.com/p/<id>.html is
 * fetched with cURL. When the response is HTTP 200 and both the title and the
 * content pattern match, the tag-stripped title is echoed and, if a DSN was
 * supplied, the article is inserted into the `36kr` table through PDO.
 */
class Fork36kr
{
    /**
     * Default regular expressions used to locate the title and the body.
     *
     * NOTE(review): the HTML delimiters of both patterns were lost when this
     * listing was extracted; only the `.*` bodies and the modifiers (none for
     * the title, `Us` for the content) survived. The defaults below are
     * assumptions; confirm them against the live page markup, or pass explicit
     * patterns to the constructor.
     */
    const DEFAULT_TITLE_PATTERN = '#<h1[^>]*>.*</h1>#';
    const DEFAULT_CONTENT_PATTERN = '#<section class="article">.*</section>#Us';

    private $start;

    private $end;

    /** Number of articles matched so far (accumulates across fork() calls). */
    private $number = 0;

    private $dsn;

    private $user;

    private $password;

    /** PDO connection, or null when no DSN was supplied. */
    private $pdo;

    private $titlePattern;

    private $contentPattern;

    /** Prepared INSERT statement, created on first save() and then reused. */
    private $insertStmt;

    /**
     * @param int         $start          id of the first article to fetch
     * @param int         $end            id of the last article to fetch (inclusive)
     * @param string      $dsn            PDO data source name; empty disables storage
     * @param string      $user           database user name
     * @param string      $password       database password
     * @param string|null $titlePattern   PCRE pattern whose whole match is the title markup;
     *                                    null selects DEFAULT_TITLE_PATTERN
     * @param string|null $contentPattern PCRE pattern whose whole match is the body markup;
     *                                    null selects DEFAULT_CONTENT_PATTERN
     */
    public function __construct($start=200100, $end=206670, $dsn='', $user='', $password='', $titlePattern=null, $contentPattern=null)
    {
        $this->start = $start;
        $this->end = $end;
        $this->dsn = $dsn;
        $this->user = $user;
        $this->password = $password;
        $this->titlePattern = $titlePattern === null ? self::DEFAULT_TITLE_PATTERN : $titlePattern;
        $this->contentPattern = $contentPattern === null ? self::DEFAULT_CONTENT_PATTERN : $contentPattern;

        if ($dsn) {
            $this->pdo = new PDO($this->dsn, $this->user, $this->password);
        }
    }

    /**
     * Fetches every article in the configured id range, echoing one
     * "url,title" line per matched article and a summary line at the end.
     *
     * Pages that fail to download, do not answer with HTTP 200, or do not
     * match both patterns are skipped without being counted or stored.
     *
     * @return void
     */
    public function fork()
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

        echo "<<<<< OK. Start Fork 36kr >>>>>\n";

        for ($i = $this->start; $i <= $this->end; $i++) {
            $url = "http://www.36kr.com/p/".$i.".html";
            curl_setopt($ch, CURLOPT_URL, $url);
            $page = curl_exec($ch);

            if ($page === false || (int) curl_getinfo($ch, CURLINFO_HTTP_CODE) !== 200) {
                continue;
            }

            $article = $this->parseArticle($page);
            if ($article === null) {
                // Previously save() still ran here and bound the raw
                // preg_match() arrays as column values.
                continue;
            }
            list($title, $content) = $article;

            echo $url.','.$title."\n";
            $this->number++;

            if ($this->pdo) {
                $this->save($title, $content, $url);
            }
        }

        curl_close($ch);

        echo '<<<< Fork Over! Total: '.$this->number.' >>>>';
    }

    /**
     * Extracts the tag-stripped title and content from a downloaded page.
     *
     * strip_tags() accepts an allow-list as its second argument if some tags
     * should be kept in the stored content.
     *
     * @param string $page raw HTML of one article page
     *
     * @return array|null array($title, $content), or null unless both patterns match
     */
    private function parseArticle($page)
    {
        $hasTitle = preg_match($this->titlePattern, $page, $title);
        $hasContent = preg_match($this->contentPattern, $page, $content);

        if (!$hasTitle || !$hasContent) {
            return null;
        }

        return array(strip_tags($title[0]), strip_tags($content[0]));
    }

    /**
     * Inserts one article into the `36kr` table.
     *
     * A failed insert is reported on the output and the crawl continues; it
     * is neither silently dropped nor allowed to abort the remaining ids.
     *
     * @param string $title   tag-stripped article title
     * @param string $content tag-stripped article body
     * @param string $url     address the article was fetched from
     *
     * @return void
     */
    private function save($title, $content, $url)
    {
        $sql = "INSERT INTO `36kr` (`id`,`title`,`content`,`url`) VALUES (null,:title,:content,:url)";

        try {
            if (!$this->insertStmt) {
                // prepare() returns false instead of throwing in PDO's silent error mode.
                $this->insertStmt = $this->pdo->prepare($sql);
            }

            $saved = $this->insertStmt && $this->insertStmt->execute(array(
                ':title' => $title,
                ':content' => $content,
                ':url' => $url,
            ));
        } catch (PDOException $e) {
            $saved = false;
        }

        if (!$saved) {
            echo 'Save failed: '.$url."\n";
        }
    }
}

// Connection settings for the local MySQL database that holds the `36kr` table.
// charset=utf8 makes the connection use the same character set as the table;
// without it the server's default connection charset applies and the crawled
// UTF-8 text can be stored garbled.
$dsn = 'mysql:host=localhost;dbname=test;charset=utf8';

$user = 'root';

$password= 'root';

// Crawl article ids 200100 through 206670 and store every matched article.
$kr = new Fork36kr(200100,206670,$dsn,$user,$password);

$kr->fork();

[文件]phpcn.sql

-- phpMyAdmin SQL Dump

-- version 4.0.5

-- http://www.php.cn/

--

-- 主机: localhost

-- 生成日期: 2013 年 10 月 03 日 00:36

-- 服务器版本: 5.6.12-log

-- PHP 版本: 5.5.3

-- Keep an explicit 0 in an AUTO_INCREMENT column as 0 instead of replacing it
-- with the next sequence value.
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";

-- Use UTC as the session time zone.
SET time_zone = "+00:00";

-- Statements wrapped in /*!40101 ... */ are executed only by MySQL 4.1.1 or
-- newer. The next three save the session's current character-set and
-- collation settings in user variables; the end of this script restores them.
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;

/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;

/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;

-- Switch the connection to utf8 for the statements that follow.
/*!40101 SET NAMES utf8 */;

--

-- 数据库: `test`

--

-- --------------------------------------------------------

--

-- 表的结构 `36kr`

--

-- Storage for crawled articles: one row per article written by the crawler's INSERT.
CREATE TABLE IF NOT EXISTS `36kr` (
    `id`      INT(11)      NOT NULL AUTO_INCREMENT, -- generated row id
    `title`   VARCHAR(128) NOT NULL,                -- article title
    `content` TEXT         NOT NULL,                -- article body
    `url`     VARCHAR(128) NOT NULL,                -- address the article was fetched from
    PRIMARY KEY (`id`)
)
ENGINE = InnoDB
DEFAULT CHARSET = utf8
AUTO_INCREMENT = 1;

-- Restore the character-set and collation settings from the @OLD_* user
-- variables (executed only by MySQL 4.1.1 or newer).
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;

/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;

/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;

相关标签:php

本文原创发布php中文网,转载请注明出处,感谢您的尊重!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值