1.items定义
class HistoryItem(scrapy.Item):
    """Scrapy item persisted into the `history` table by the MySQL pipeline."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()   # project id
    name = scrapy.Field()  # project name
class ForecastItem(scrapy.Item):
    """Scrapy item persisted into the `forecast` table by the MySQL pipeline."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()   # project id
    name = scrapy.Field()  # project name
2.文件编码
# coding=utf-8
# Put these lines at the very top of the file.
# NOTE: Python 2 only. `reload` is a builtin and `sys.setdefaultencoding`
# is only reachable via the reload trick in Python 2; Python 3 neither
# needs nor allows this (str is already Unicode there).
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
3.spider 写法演示(对原文件进行了修改删减,仅做演示)
# coding=utf-8
import cgi
import sys
import time

import scrapy

from wanglu.items import ForecastItem, SupplyItem

reload(sys)
sys.setdefaultencoding('utf-8')
# run the spider with: scrapy crawl wang
class WangSpider(scrapy.Spider):
    """Demo spider: requests detail pages by numeric id and yields items."""
    name = "wang"
    base_url = "http://www.sample.com/info?id="  # replace with your own URL

    # Variant A: read the id parameters from seedall.txt, one per line
    # def __init__(self):
    #     ids = open("seedall.txt", "r").readlines()
    #     for did in ids:
    #         self.start_urls.append(self.base_url + did.replace("\n", "").strip())

    # Variant B: generate the ids from a numeric range
    def __init__(self, *args, **kwargs):
        super(WangSpider, self).__init__(*args, **kwargs)
        # Build a per-instance list. Appending to the class-level start_urls
        # mutates a shared class attribute, so every new instance would keep
        # accumulating the same urls.
        self.start_urls = [self.base_url + str(did) for did in range(22312, 23475)]

    def parse(self, response):
        # Throttle politely. (Prefer the DOWNLOAD_DELAY setting: sleeping in
        # the callback blocks the whole reactor. The original wrapped this in
        # try/except BaseException, which silently hid the fact that `time`
        # was never imported.)
        time.sleep(1)
        pid = response.url.split("=")[-1]
        table = response.xpath("/html/body/div[@id='basicInfoDiv']/div/table[@class='jbxx_table'][1]")
        # NOTE(review): WangluItem is not imported in this trimmed snippet --
        # import it from wanglu.items in the real project.
        item = WangluItem()
        item["pid"] = pid
        item["company_name"] = table.xpath(".//td/label[@id='lblcname']/text()")[0].extract().strip().replace("\r\n", "")
        if item["company_name"].strip() == '':
            return  # empty page: nothing worth yielding
        yield item
        forecastTable = response.css("#businessPlanDiv div > table")[1]
        if forecastTable and forecastTable.css("tr"):
            for i, tr in enumerate(forecastTable.css("tr")):
                if i > 1:  # skip the header rows
                    # NOTE(review): Forecast2Item is not imported here either.
                    forecast = Forecast2Item()
                    forecast["pid"] = pid
                    # cgi.escape(s, True) escapes HTML specials AND quotes,
                    # which keeps the value safe to store/render later.
                    forecast['year'] = cgi.escape(tr.xpath(".//td[1]/text()")[0].extract().strip().replace("\r\n", ""), True)
                    # NOTE(review): this reads td[1] again (the year cell) and
                    # extracts the whole element, not its text -- probably
                    # td[2]/text() was intended; confirm against the real page.
                    forecast['edu_bg'] = cgi.escape(tr.xpath('.//td[1]')[0].extract().strip().replace("\r\n", ""), True)
                    yield forecast
选择器文档链接
3.1 xpath 浏览器工具可以查看,下图为360浏览器获取xpath
但是这样获取的并不一定有效,而且可能很长,经常需要根据自己的情况修改xpath.
tbody 是一个小坑,因为源返回值中可能没有tbody ,但是浏览器会自动加上tbody, 造成路径问题,要注意。
3.2 css 选择器, ">" 直接子元素 ,":nth-child()" 我较常用
注意 :xpath和css方法返回的都是”数组”
cgi.escape(str,True) 将html特殊符号和引号转义,在保存数据到数据库时很有用
取文本 :
tr.xpath(".//td/text()")
tr.css("td::text")
取元素 (td中包含其他标签时):
tr.xpath(".//td")
tr.css("td")
取属性:
supply['file_url'] =tr.xpath(".//td[3]/a/@href")
supply['file_url'] =tr.css("a::attr(href)")
更多更详细请查文档
4.pipelines
log
import logging

# Module-level logger shared by the pipelines below.
logger = logging.getLogger('WLog')
logger.setLevel(logging.DEBUG)

# Console handler (created but intentionally not attached -- see below).
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# File handler: append UTF-8 records to error.log.
file_handler = logging.FileHandler("error.log", "a+", "utf-8")
file_handler.setLevel(logging.DEBUG)

log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(log_format)
file_handler.setFormatter(log_format)

# Attach handlers to the logger (only the file handler is active).
# self.logger.addHandler(console_handler)
logger.addHandler(file_handler)
将结果以json格式保存到文件
import json
class FilePipeline(object):
    """Append every scraped item to items.jl, one JSON object per line."""

    def __init__(self):
        # Append mode so repeated runs keep the earlier results.
        self.file = open('items.jl', 'a+')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
        # the output file instead of being mangled into \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
保存到EXCEL
from openpyxl import Workbook
class ExcelPipeline(object):
    """Collect items into an in-memory workbook and save it when the spider closes."""

    def __init__(self):
        try:
            self.wb = Workbook()
            # grab the active worksheet
            self.ws = self.wb.active
        except Exception as e:  # was BaseException: too broad, swallows KeyboardInterrupt
            logger.warning(e.__str__())  # logger.warn is a deprecated alias

    def process_item(self, item, spider):
        try:
            self.ws.append([item['did'], item["company_name"], item["industry"], item["area"]])
        except Exception as e:
            # str() the id: if `did` is an int, int + str raises TypeError
            # inside the handler and masks the original error.
            logger.warning(str(item["did"]) + ":" + e.__str__())
        return item

    def close_spider(self, spider):
        # `time` was never imported at the top of the original snippet;
        # import it here so the timestamped filename actually works.
        import time
        try:
            self.wb.save(time.strftime("%y%m%d%H%M%S") + ".xlsx")
        except Exception as e:
            logger.warning(e.__str__())
保存到数据库
import logging
import MySQLdb
# 保存到数据库
class MySQLPipeline(object):
    """Persist HistoryItem / ForecastItem objects into MySQL."""

    def __init__(self, db):
        self.db = db  # live MySQLdb connection

    @classmethod
    def from_settings(cls, settings):
        # Alternate constructor called by scrapy with the project settings.
        db = MySQLdb.connect(settings["MYSQL_HOST"], settings["MYSQL_USER"],
                             settings["MYSQL_PASSWD"], settings["MYSQL_DBNAME"],
                             charset="utf8")
        return cls(db)

    # called by scrapy for every yielded item
    def process_item(self, item, spider):
        cursor = self.db.cursor()
        item["pid"] = int(item["pid"])
        if isinstance(item, HistoryItem):
            try:
                # Parameterized query: the driver quotes/escapes the values,
                # fixing the injection and quoting bugs of %-formatted SQL.
                cursor.execute(
                    'insert into history(pid,name,v2013,v2014,v2015) VALUES (%s, %s, %s, %s, %s)',
                    (item['pid'], item["name"], item["v2013"], item["v2014"], item["v2015"]))
                # The original never committed, so nothing was persisted
                # unless autocommit happened to be enabled.
                self.db.commit()
            except Exception as e:  # was BaseException: far too broad
                self.db.rollback()
                logger.error(str(item["pid"]) + ":HistoryItem:" + e.__str__())
        elif isinstance(item, ForecastItem):
            cursor.execute("select id from forecast where pid = %s", (item["pid"],))
            ret = cursor.fetchone()
            try:
                if ret:  # pid already present: update the existing row
                    cursor.execute('update ..........')
                else:  # original had `else` without a colon and a stray `)`
                    cursor.execute('insert ..........')
                self.db.commit()
            except Exception as e:
                self.db.rollback()
                logger.error(str(item["pid"]) + ":ForecastItem:" + e.__str__())
        else:
            pass  # unknown item type: pass it through untouched
        return item

    def close_spider(self, spider):
        self.db.close()
当字段众多时如何更简便的保存到数据库?知道的留言告知下多谢。(17年8月28日更新:突然想到ORM,百度了一下果然python也有ORM框架SQLAlchemy、Django,以后可以试下怎么用)
拼接表的字段
SELECT group_concat(COLUMN_NAME) from information_schema.COLUMNS where table_name = 'business_plan' and table_schema = 'test'; -- 得到逗号分隔的字符串
SELECT group_concat(COLUMN_NAME SEPARATOR "='%s',") from information_schema.COLUMNS where table_name = 'business_plan' and table_schema = 'test';
5 Settings (截取我设置了的)
# -*- coding: utf-8 -*-
# Scrapy project settings (only the values the author changed are shown).
BOT_NAME = 'wanglu'
SPIDER_MODULES = ['wanglu.spiders']
NEWSPIDER_MODULE = 'wanglu.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Kept at 1 here to crawl slowly / politely.
CONCURRENT_REQUESTS = 1
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'wanglu.pipelines.SomePipeline': 300,
#}
# Earlier pipeline combinations, kept for reference:
#ITEM_PIPELINES = {'wanglu.pipelines.WangluPipeline': 300}
#ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300,'wanglu.pipelines.MySQLPipeline': 400}
#ITEM_PIPELINES = {'wanglu.pipelines.ExcelPipeline': 300}
# NOTE(review): MySQLPipeline2 is not among the pipeline classes shown in this
# post (only MySQLPipeline is) -- confirm it exists in wanglu/pipelines.py.
ITEM_PIPELINES = {'wanglu.pipelines.MySQLPipeline2': 300}
# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'test'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123123'
# end of MySQL database configure setting