1.获取分类的名称和url(text:名字,href:链接)
$url="http://books.toscrape.com/";
$data=QueryList::get($url)->rules([
'categoryname' => ['#default > div > div > div > aside > div.side_categories > ul > li > ul > li > a','text'],
'url'=>['#default > div > div > div > aside > div.side_categories > ul > li > ul > li > a','href']
])->query()->getData();
复制代码
2.通过图书url获取图书信息
(1)获取单页数据,因url不完整,则需要拼接
//每页数据
$data=QueryList::get($url)->rules([
'bookname'=>['#default > div > div > div > div > section > div:nth-child(2) > ol > li: > article > h3 > a','title'],
'bookprice'=>['#default > div > div > div > div > section > div:nth-child(2) > ol > li: > article > div.product_price > p.price_color','text']
])->queryData();
复制代码
(2)获取下一页链接,判断是否有下一页,若有,拼接链接
strrpos("str","/")查询字符串str中,最后一个“/”所在位置。
substr("str",num)从第num个字符开始截取字符,到最后一位。
$nexturl=QueryList::get($url)->find('#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a')->href;
//判断是否有下一页
if($nexturl!=NULL&&$nexturl!=''){ //链接修改
$urll=$nexturl;
$str=substr($url,strrpos($url,"/")+1);
$url=str_replace($str,$urll,$url);
$num==1;
}
复制代码
(3)通过do-while循环获取所有的图书信息
$alldata=array();
do{
$num=0;
//每页数据
$data=QueryList::get($url)->rules([
'bookname'=>['#default > div > div > div > div > section > div:nth-child(2) > ol > li: > article > h3 > a','title'],
'bookprice'=>['#default > div > div > div > div > section > div:nth-child(2) > ol > li: > article > div.product_price > p.price_color','text']
])->queryData();
$alldata=array_merge($alldata,$data);
//print_r($data);
$nexturl=QueryList::get($url)->find('#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a')->href;
//判断是否有下一页
if($nexturl!=NULL&&$nexturl!=''){ //链接修改
//print($nexturl);
$urll=$nexturl;
$str=substr($url,strrpos($url,"/")+1);
$url=str_replace($str,$urll,$url);
$num=1;
//print_r($url."\n");
}
}while($num==1);
复制代码
3.将爬取的数据存入mysql数据库
(1)创建数据库和表格
create database bookstore character set utf8;
use bookstore;
CREATE TABLE `category` (
`categoryid` int(11) NOT NULL AUTO_INCREMENT,
`categoryname` varchar(255) NOT NULL,
PRIMARY KEY (`cid`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
CREATE TABLE `books` (
`bookid` int(11) NOT NULL AUTO_INCREMENT,
`bookname` varchar(255) NOT NULL,
`bookprice` varchar(10) NOT NULL,
`categoryid` int(11) NOT NULL,
PRIMARY KEY (`bid`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
复制代码
(2)PHP连接数据库
// 创建mysql连接
$servername = "localhost"; // 本地主机
$username = "root"; // 用户名
$password = "root"; // 用户密码
$dbname = "bookstore"; // 数据库名字
$conn = mysqli_connect($servername, $username, $password, $dbname);
mysqli_query($conn,"SET NAMES utf8");
复制代码
(3)将数据写入数据库
单条数据插入 mysqli_query,写入图书类,获取图书类ID
$sqltype="insert into category(categoryname) values (\"{$dtype['categoryname']}\")";
mysqli_query($conn,$sqltype);
//分类ID
$typeid=mysqli_insert_id($conn);
复制代码
多条数据插入 mysqli_multi_query,写入图书信息
foreach ($alldata as $key=>$value){
$sql.="insert into books(bookname,bookprice,categoryid) values (\"{$value['bookname']}\",\"{$value['bookprice']}\",$typeid);";
}
$result=mysqli_multi_query($conn,$sql);
复制代码
判断是否成功
if($result){echo"写入成功";}
else{echo"写入失败";}
复制代码
完成
4.完整代码
require 'vendor/autoload.php';
use QL\QueryList;
//$url="http://books.toscrape.com/";
function get_url_name($url){//获取分类链接和名字
$data=QueryList::get($url)->rules([
'categoryname' => ['#default > div > div > div > aside > div.side_categories > ul > li > ul > li > a','text'],
'url'=>['#default > div > div > div > aside > div.side_categories > ul > li > ul > li > a','href']
])->query()->getData();
//print(count($data));
//print_r($data->all());
return $data;
}
function get_bookname_price($url){
$alldata=array();
do{
$num=0;
//每页数据
$data=QueryList::get($url)->rules([
'bookname'=>['#default > div > div > div > div > section > div:nth-child(2) > ol > li: > article > h3 > a','title'],
'bookprice'=>['#default > div > div > div > div > section > div:nth-child(2) > ol > li: > article > div.product_price > p.price_color','text']
])->queryData();
$alldata=array_merge($alldata,$data);
//print_r($data);
//获取下一页链接
$nexturl=QueryList::get($url)->find('#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a')->href;
//判断是否有下一页
if($nexturl!=NULL&&$nexturl!=''){ //链接修改
//print($nexturl."\n");
$urll=$nexturl;
$str=substr($url,strrpos($url,"/")+1);
$url=str_replace($str,$urll,$url);
$num=1;
//print_r($url."\n");
}
}while($num==1);
print (count($alldata)."\n");
/* $i=1;
foreach ($alldata as $d){
print_r($i++."\t".$d['bookname']."\t\t".$d['bookprice']."\n");
}*/
return $alldata;
}
//get_bookname_price('http://books.toscrape.com/catalogue/category/books/mystery_3/index.html');
function start(){
$url="http://books.toscrape.com/";
$data=get_url_name($url);
// 创建mysql连接
$servername = "localhost"; // 本地主机
$username = "root"; // 用户名
$password = "root"; // 用户密码
$dbname = "bookstore"; // 数据库名字
$conn = mysqli_connect($servername, $username, $password, $dbname);
mysqli_query($conn,"SET NAMES utf8");
$sql='';
$sqltype='';
//$bookname='';
//$bookprice='';
foreach ($data as $key=>$dtype){
print_r($dtype['categoryname']."\n");
// $categoryname=$dtype['categoryname'];
$sqltype="insert into category(categoryname) values (\"{$dtype['categoryname']}\")";
mysqli_query($conn,$sqltype);
//分类ID
$typeid=mysqli_insert_id($conn);
$alldata=get_bookname_price( "http://books.toscrape.com/".$dtype['url']);
$url="http://books.toscrape.com/".$url;
foreach ($alldata as $key=>$value){
//print_r($data);
// print_r("\t\"".$value['bookname']."\"\t\t".$value['bookprice']."\n");
//$bookname=$value['bookname'];
//$bookprice=$value['bookprice'];
$sql.="insert into books(bookname,bookprice,categoryid) values (\"{$value['bookname']}\",\"{$value['bookprice']}\",$typeid);";
}
}
echo($sql);
$result=mysqli_multi_query($conn,$sql);
if($result){
echo"写入成功";
}else{
echo"写入失败";
}
}
start();
复制代码