利用 Scrapy 框架把数据存入到文本、JSON、CSV、MySQL、MongoDB、Redis 等
1.存入到文本中
class TextPipeline(object):
    """Scrapy item pipeline that writes every item to ``./sun.txt``, one per line."""

    def open_spider(self, spider):
        """Open the output file (truncating any previous content)."""
        self.fp = open('./sun.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write the item's dict representation as one text line.

        Returns the item unchanged so later pipelines still receive it.
        """
        # A dict cannot be concatenated with "\n" directly (TypeError),
        # so convert it to its string form first.
        self.fp.write(str(dict(item)) + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.fp.close()
2.存入到json中
class JsonPipeline(object):
    """Scrapy item pipeline that stores items in ``sun.json``, one JSON object per line."""

    def open_spider(self, spider):
        """Create (or truncate) the UTF-8 output file."""
        self.fp = open("sun.json", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the item to JSON and append it as a single line.

        ``ensure_ascii=False`` keeps non-ASCII characters readable instead of
        escaping them as ``\\uXXXX``. The item is returned unchanged.
        """
        encoded = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(encoded + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file once the spider is done."""
        self.fp.close()
3.存入到csv中
class CsvPipeline(object):
    """Scrapy item pipeline that writes every item as one row of ``./sun.csv``."""

    # Item fields, in the column order written to the CSV file.
    FIELDS = (
        'name_size',
        'name_min',
        'name_url',
        'book_img',
        'book_source',
        'book_talk',
        'book_press',
    )

    def open_spider(self, spider):
        """Open the CSV file and create the row writer."""
        # newline='' lets the csv module control line endings itself.
        self.fp = open("./sun.csv", "w", encoding='utf-8', newline='')
        self.writer = csv.writer(self.fp)

    def process_item(self, item, spider):
        """Write the item's fields as one CSV row and return the item.

        Raises:
            KeyError: if the item lacks one of ``FIELDS``.
        """
        self.writer.writerow(tuple(item[field] for field in self.FIELDS))
        return item

    def close_spider(self, spider):
        """Close the CSV file so buffered rows are flushed to disk."""
        self.fp.close()
4.存入到mysql中
class MysqlPipeline(object):
    """Scrapy item pipeline that inserts every item as a row of the MySQL table ``sun``."""

    # Item fields, in the order they are passed to ``insert into sun values (...)``.
    FIELDS = (
        "name_min",
        "name_url",
        "book_img",
        "book_source",
        "book_talk",
        "book_press",
        "name_size",
    )

    def open_spider(self, spider=None):
        """Connect to MySQL and create the cursor reused for every item."""
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='123456',
            database='sun',
            charset='utf8',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item and commit; on failure roll back and log the error.

        The item is returned in either case so later pipelines still run.

        Raises:
            KeyError: if the item lacks one of ``FIELDS``.
        """
        import logging

        # Placeholders are left unquoted: the driver escapes and quotes the
        # values itself, which also prevents SQL injection from scraped text.
        sql = 'insert into sun values (%s, %s, %s, %s, %s, %s, %s)'
        params = tuple(item[field] for field in self.FIELDS)
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception:
            # Best effort: one bad row must not stop the crawl, but the
            # failure should be visible in the log.
            self.conn.rollback()
            logging.getLogger(__name__).exception(
                "Failed to insert item into MySQL table 'sun'"
            )
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection."""
        self.cursor.close()
        self.conn.close()
5.存入到MongoDB中
class MongoPipeline(object):
    """Scrapy item pipeline that stores every item in MongoDB (``sun.sunning``)."""

    def open_spider(self, spider):
        """Connect to the local MongoDB server and select the target collection."""
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.client.sun.sunning

    def process_item(self, item, spider):
        """Insert the item as one document and return the item.

        A dict copy is inserted so the ``_id`` that the driver adds to the
        document does not leak into the item handed to later pipelines.
        """
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider=None):
        """Close the MongoDB client."""
        self.client.close()
6.存入到Redis中
from scrapy.utils.project import get_project_settings
class RedisPipeline(object):
    """Scrapy item pipeline that pushes every item onto the Redis list ``Sun``."""

    def open_spider(self, spider=None):
        """Connect to Redis using the host and port from the project settings.

        The port is read from ``REDIS_PORT``; the misspelled ``REDIS_POST`` is
        still honoured as a fallback for existing settings files.
        """
        settings = get_project_settings()
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT') or settings.get('REDIS_POST') or 6379
        self.conn = redis.StrictRedis(host=host, port=int(port))

    def process_item(self, item, spider):
        """Serialize the item to JSON, push it onto the list and return the item."""
        import json

        # redis-py only accepts bytes, str, int or float values, so the dict
        # has to be encoded before it can be pushed.
        payload = json.dumps(dict(item), ensure_ascii=False)
        self.conn.lpush("Sun", payload)
        return item

    def close_spider(self, spider=None):
        """Disconnect every connection held by the client's pool."""
        self.conn.connection_pool.disconnect()

    # Former (misspelled) hook name, kept so existing callers keep working.
    close_item = close_spider
6.1在settings中设置
# Redis connection settings read by RedisPipeline.
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# Deprecated misspelling of REDIS_PORT, kept for code that still reads it.
REDIS_POST = '6379'