之前学习用的,爬取东方资讯新闻列表的代码
ArticlelistItem文件中的字段可根据自己的需要改变
# -*- coding: utf-8 -*-
import scrapy
import json
import time
import pymysql
from articlelist.items import ArticlelistItem
class EastdaySpider(scrapy.Spider):
    """Spider that crawls Eastday (东方资讯) news list pages.

    Reads the active article categories from MySQL table
    ``test_article_cate`` and requests 24 pages of the JSON feed for each
    category, yielding one ``ArticlelistItem`` per non-video article.
    """
    name = 'eastday'
    allowed_domains = ['mini.eastday.com']
    start_urls = ['http://mini.eastday.com/']

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider perform its own name/kwargs initialization
        # before opening our resources (the original skipped this).
        super().__init__(*args, **kwargs)
        # Connect to the local MySQL database that stores the category list.
        self.conn = pymysql.connect(host="localhost", database="test", user="root", password="root",
                                    charset="utf8")
        # Get a cursor for the one seeding query in start_requests().
        self.cursor = self.conn.cursor()

    def start_requests(self):
        """Yield one request per (active category, page 1..24) pair."""
        # Fixed 'SElECT' typo (MySQL tolerated it, but it was a typo).
        sql = "SELECT id,cate_name,url FROM test_article_cate WHERE state=1"
        try:
            self.cursor.execute(sql)
            rows = self.cursor.fetchall()
        finally:
            # The DB is only needed to seed requests; always release it,
            # even if the query fails.
            self.cursor.close()
            self.conn.close()
        for cid, cate_name, req_url in rows:
            if not req_url:
                # Category rows without a collect URL (e.g. the video row)
                # cannot be crawled — skip them.
                continue
            meta = {
                "cid": cid,
                "cate_name": cate_name,
                "cate_qur": req_url
            }
            qid = time.time()
            for pg in range(1, 25):
                url = ("https://toutiao.eastday.com/toutiao_h5/RefreshJP"
                       "?type={cate}&recgid=15665241100115754&qid={qid}"
                       "&idx=0&pgnum={pg}&os=iOS+11_0").format(cate=req_url, qid=qid, pg=pg)
                yield scrapy.Request(url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Parse one JSON feed page; yield an item per non-video article."""
        # response.text replaces the deprecated response.body_as_unicode().
        data = json.loads(response.text)
        cate_id = response.meta["cid"]
        for item in data["data"]:
            # Skip video entries; .get() avoids a KeyError when the feed
            # omits the "video_link" field entirely.
            if item.get("video_link"):
                continue
            news_item = ArticlelistItem()
            news_item["title"] = item["topic"]
            news_item["url"] = str(item["url"])
            news_item["author"] = item["source"]
            news_item["imgs"] = item["miniimg"][0]["src"]
            news_item["cid"] = cate_id
            news_item["subtime"] = item["date"]
            # Feed dates lack seconds ("YYYY-MM-DD HH:MM"); append ":00"
            # so strptime can convert them to a unix timestamp.
            time_array = time.strptime(item["date"] + ":00", "%Y-%m-%d %H:%M:%S")
            timestamp = int(time.mktime(time_array))
            news_item["add_time"] = timestamp
            news_item["sub_time"] = timestamp
            yield news_item
上述爬虫用到的数据库代码如下
`# Host: 39.100.139.168 (Version 5.5.62-log)
Date: 2020-08-09 18:27:12
Generator: MySQL-Front 6.1 (Build 1.26)
Structure for table “article_cate”
DROP TABLE IF EXISTS article_cate;
CREATE TABLE article_cate
(
  id int(11) NOT NULL AUTO_INCREMENT,
  pid smallint(6) NOT NULL DEFAULT '0',
  cate_name varchar(255) DEFAULT NULL COMMENT '分类名',
  url varchar(255) DEFAULT NULL COMMENT '采集地址',
  deep tinyint(1) DEFAULT NULL COMMENT '层次',
  order_num smallint(6) DEFAULT '0',
  state tinyint(1) NOT NULL DEFAULT '1',
  PRIMARY KEY (id)
) ENGINE=MyISAM AUTO_INCREMENT=49 DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC COMMENT='文章分类';
Data for table “article_cate”
/*!40000 ALTER TABLE article_cate DISABLE KEYS */;
INSERT INTO article_cate VALUES (1,0,'头条','toutiao',1,1,1),(2,0,'社会','shehui',0,3,1),(3,0,'国内','guonei',0,3,1),(4,0,'国际','guoji',0,3,1),(5,0,'娱乐','yule',0,3,1),(6,0,'体育','tiyu',0,3,1),(7,0,'军事','junshi',0,3,1),(8,0,'科技','keji',0,3,1),(9,0,'财经','caijing',0,3,1),(10,0,'时尚','shishang',0,3,1),(11,0,'健康','jiankang',0,3,1),(13,0,'人文','lishi',0,3,1),(14,0,'星座','xingzuo',0,3,1),(15,0,'游戏','youxi',0,3,1),(16,0,'科学','kexue',0,3,1),(17,0,'互联网','hulianwang',0,3,1),(18,0,'数码','shuma',0,3,1),(19,0,'保健','baojian',0,3,1),(20,0,'健身','jianshen',0,3,1),(21,0,'饮食','yinshi',0,3,1),(22,0,'减肥','jianfei',0,3,1),(23,0,'CBA','cba',0,3,1),(24,0,'德甲','dejia',0,3,1),(25,0,'意甲','yijia',0,3,1),(26,0,'网球','wangqiu',0,3,1),(27,0,'中超','zhongchao',0,3,1),(28,0,'西甲','xijia',0,3,1),(29,0,'英超','yingchao',0,3,1),(30,0,'棋牌','qipai',0,3,0),(31,0,'高尔夫','gaoerfu',0,3,1),(32,0,'排球','paiqiu',0,3,1),(33,0,'羽毛球','yumaoqiu',0,3,1),(34,0,'家居','jiaju',0,3,1),(35,0,'外汇','waihui',0,3,1),(36,0,'保险','baoxian',0,3,1),(37,0,'不动产','budongchan',0,3,1),(38,0,'黄金','huangjin',0,3,1),(39,0,'新三板','xinsanban',0,3,1),(40,0,'股票','gupiao',0,3,1),(41,0,'期货','qihuo',0,3,1),(42,0,'基金','jijin',0,3,1),(43,0,'理财','licai',0,3,1),(44,0,'电影','dianying',0,3,1),(45,0,'电视','dianshi',0,3,1),(46,0,'八卦','bagua',0,3,1),(48,0,'视频',NULL,0,2,2);
/*!40000 ALTER TABLE article_cate ENABLE KEYS */;
`
爬取新闻详情的代码
ArticlelistItem文件中的字段可根据自己的需要改变
import scrapy
import pymysql
from articleitem.items import ArticleitemItem
class EastdaySpider(scrapy.Spider):
    """Spider that fetches the article body for pending Eastday news URLs.

    Reads up to 2000 un-crawled article URLs (``is_spider=0``) from MySQL
    table ``test_article`` and yields the raw ``<article id="J_article">``
    HTML of each page, tagged with the row's database id.
    """
    name = 'eastday'
    allowed_domains = ['mini.eastday.com']
    start_urls = ['https://mini.eastday.com/']

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider perform its own name/kwargs initialization
        # before opening our resources (the original skipped this).
        super().__init__(*args, **kwargs)
        # Connect to the MySQL database that holds the article URL queue.
        self.conn = pymysql.connect(host="localhost", database="test", user="root", password="root",
                                    charset="utf8")
        # Get a cursor for the one seeding query in start_requests().
        self.cursor = self.conn.cursor()

    def start_requests(self):
        """Yield one request per pending article URL (batch of 2000)."""
        # Fixed 'SElECT' typo (MySQL tolerated it, but it was a typo).
        sql = "SELECT id,url FROM test_article WHERE is_spider=0 LIMIT 2000"
        try:
            self.cursor.execute(sql)
            rows = self.cursor.fetchall()
        finally:
            # The DB is only needed to seed requests; always release it,
            # even if the query fails.
            self.cursor.close()
            self.conn.close()
        # article_id avoids shadowing the builtin `id`.
        for article_id, req_url in rows:
            meta = {
                "id": article_id,
                "url": req_url,
            }
            yield scrapy.Request(req_url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Extract the article HTML and yield an item tagged with its DB id."""
        news_item = ArticleitemItem()
        # Whole <article id="J_article"> element as raw HTML
        # (None when the page has no such element).
        news_item["content"] = response.css("article#J_article").extract_first()
        news_item["id"] = response.meta["id"]
        news_item["url"] = response.meta["url"]
        yield news_item