After creating the project, first create a spider from the command line:
scrapy genspider <spider_name> www.xxxx.com
Scrapy automatically generates a .py file under the project's spiders directory. The generated class inherits from scrapy.Spider, and the template mainly contains the following items; a sketch of the generated file follows the list:
- name: identifies the Spider. The name must be unique; you cannot give different Spiders the same name.
- start_urls: the list of URLs the Spider starts crawling from. The first pages fetched will come from this list; subsequent URLs are extracted from the data returned by these initial requests.
- parse(): a method of the spider. When called, the Response object produced after each initial URL finishes downloading is passed to it as the only argument. This method is responsible for parsing the returned data (response data), extracting data (generating items), and generating Request objects for URLs that need further processing.
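For reference, the file produced by genspider typically looks roughly like the sketch below; the class name, spider name, and domain simply follow the arguments passed on the command line, so treat the identifiers here as placeholders.

import scrapy


class XxxxSpider(scrapy.Spider):
    # "xxxx" stands in for whatever spider name was passed to genspider
    name = 'xxxx'
    allowed_domains = ['www.xxxx.com']
    start_urls = ['http://www.xxxx.com/']

    def parse(self, response):
        # each downloaded Response is handed to parse(): extract data here
        # and/or yield further Requests
        pass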
The following is an example that crawls question-and-answer data from Zhihu:
import re
import json
import datetime

import scrapy
from scrapy.loader import ItemLoader
try:
    import urlparse as parse
except ImportError:
    from urllib import parse

from ArticleSpider.items import ZhiHuAnswerItem, ZhiHuQuestionItem


class ZhihuSelSpider(scrapy.Spider):
    name = 'zhihu_sel'
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    # URL template for the first page of answers to a question
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccollapsed_counts%2Creviewing_comments_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.is_blocking%2Cis_blocked%2Cis_followed%2Cvoteup_count%2Cmessage_thread_token%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit={1}&offset={2}"

    # request headers
    headers = {
        "HOST": "www.zhihu.com",
        "Referer": "https://www.zhihu.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    # per-spider settings
    custom_settings = {
        "COOKIES_ENABLED": True
    }

    def parse(self, response):
        """
        Extract all URLs from the HTML page and follow them for further crawling.
        If a URL matches the /question/xxx pattern, download it and parse it directly.
        """
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        all_urls = filter(lambda x: x.startswith("https"), all_urls)
        for url in all_urls:
            match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|&).*", url)
            if match_obj:
                print(url)
                request_url = match_obj.group(1)
                question_id = match_obj.group(2)
                # question page: download it and hand it to the extraction callback
                yield scrapy.Request(request_url, headers=self.headers,
                                     meta={"zhihu_id": question_id},
                                     callback=self.parse_question)
            else:
                # not a question page: keep following links with parse()
                yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse_question(self, response):
        item_loader = ItemLoader(item=ZhiHuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", response.meta.get("zhihu_id", ""))
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeaderActions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-itemValue::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
        question_item = item_loader.load_item()

        # request the first page of answers for this question
        yield scrapy.Request(self.start_answer_url.format(response.meta.get("zhihu_id", ""), 20, 0),
                             headers=self.headers, callback=self.parse_answer)
        yield question_item

    def parse_answer(self, response):
        # process the answers of a question
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]

        # extract the concrete fields of each answer
        for answer in ans_json["data"]:
            answer_item = ZhiHuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["parise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()
            yield answer_item

        # keep paging through the answers until the API reports the last page
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)
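With the items and pipelines below in place, the spider is started from the project root with the standard scrapy crawl zhihu_sel command. The spider assumes a logged-in Zhihu session, which is why COOKIES_ENABLED is switched on in custom_settings; obtaining the login cookies themselves is outside the scope of this snippet.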
Next, configure items.py:
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join

from ArticleSpider.utils.common import extract_num
from ArticleSpider.settings import SQL_DATE_FORMAT, SQL_DATETIME_FORMAT


class ArticlespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def date_convert(value):
    """Convert a string such as 2018/01/01 into a date, falling back to today."""
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    """Extract the first number contained in a string."""
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_comment_tags(value):
    # drop the "评论" (comment) entries that get extracted into tags
    if "评论" in value:
        return ""
    else:
        return value


def return_value(value):
    return value


class ArticleItemLoader(ItemLoader):
    # custom ItemLoader: by default take the first extracted value as output
    default_output_processor = TakeFirst()


class ZhiHuQuestionItem(scrapy.Item):
    # Zhihu question item
    zhihu_id = scrapy.Field()
    topics = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_num = scrapy.Field()
    comments_num = scrapy.Field()
    watch_user_num = scrapy.Field()
    click_num = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        # SQL statement for inserting into the zhihu_question table
        insert_sql = """
            insert into zhihu_question(zhihu_id, topics, url, title, content, answer_num, comments_num,
                watch_user_num, click_num, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), answer_num=VALUES(answer_num), comments_num=VALUES(comments_num),
                watch_user_num=VALUES(watch_user_num), click_num=VALUES(click_num)
        """
        zhihu_id = self["zhihu_id"][0]
        topics = ",".join(self["topics"])
        url = self["url"][0]
        title = "".join(self["title"])
        content = "".join(self["content"])
        answer_num = extract_num("".join(self["answer_num"]))
        comments_num = extract_num("".join(self["comments_num"]))

        # the NumberBoard usually holds both the follower count and the view count
        if len(self["watch_user_num"]) == 2:
            watch_user_num = int(self["watch_user_num"][0].replace(",", "").strip())
            click_num = int(self["watch_user_num"][1].replace(",", "").strip())
        else:
            watch_user_num = int(self["watch_user_num"][0].replace(",", "").strip())
            click_num = 0
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)

        params = (zhihu_id, topics, url, title, content, answer_num, comments_num,
                  watch_user_num, click_num, crawl_time)
        return insert_sql, params


class ZhiHuAnswerItem(scrapy.Item):
    # Zhihu answer item
    zhihu_id = scrapy.Field()
    url = scrapy.Field()
    question_id = scrapy.Field()
    author_id = scrapy.Field()
    content = scrapy.Field()
    parise_num = scrapy.Field()
    comments_num = scrapy.Field()
    create_time = scrapy.Field()
    update_time = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        # SQL statement for inserting into the zhihu_answer table
        insert_sql = """
            insert into zhihu_answer(zhihu_id, url, question_id, author_id, content, parise_num, comments_num,
                create_time, update_time, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), comments_num=VALUES(comments_num), parise_num=VALUES(parise_num),
                update_time=VALUES(update_time)
        """
        create_time = datetime.datetime.fromtimestamp(self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        update_time = datetime.datetime.fromtimestamp(self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        params = (
            self["zhihu_id"], self["url"], self["question_id"],
            self["author_id"], self["content"], self["parise_num"],
            self["comments_num"], create_time, update_time,
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params
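get_insert_sql above relies on extract_num imported from ArticleSpider.utils.common, which is not reproduced in this post. As an assumption about what that helper does, a minimal sketch (mirroring the get_nums processor defined earlier) could look like this:

import re


def extract_num(text):
    # hypothetical helper: strip thousands separators and return the first
    # run of digits found in a string such as "1,234 个回答", or 0 if none
    match_re = re.match(r".*?(\d+).*", text.replace(",", ""))
    if match_re:
        return int(match_re.group(1))
    return 0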
Finally, configure the pipeline in pipelines.py:
import codecs
import json

import MySQLdb
import MySQLdb.cursors
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi


class ArticlespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlPipeline(object):
    # write to MySQL synchronously
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'password', 'article_spider',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # use Twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert: each item builds its own SQL via get_insert_sql()
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
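For the Twisted pipeline to actually run, it has to be registered in settings.py, and the MYSQL_* keys read in from_settings (plus the SQL_DATETIME_FORMAT / SQL_DATE_FORMAT constants imported in items.py) must be defined there as well. A sketch of the relevant settings, with placeholder values mirroring the synchronous pipeline above, might look like:

# settings.py (excerpt) -- placeholder values, adjust to your own environment
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 300,
}

MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "password"

# assumed formats used when serializing datetimes for the SQL statements
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
SQL_DATE_FORMAT = "%Y-%m-%d"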