1. Requirement: scrape the authors and jokes from Qiushibaike (糗事百科).
2. Preparation:
- Create and use a Scrapy project
  - `scrapy startproject qiubaiPro`
- Create the spider file
  - `cd qiubaiPro`
  - `scrapy genspider qiubai www.xxx.com` (this generates a spider template; a sketch of it follows below)
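For reference, `genspider` produces roughly this boilerplate (the exact template varies slightly across Scrapy versions), which section 3 then fills in:

```python
import scrapy


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    def parse(self, response):
        pass
```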
3. Code
- qiubai.py (the spider):
```python
import scrapy
from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/text/']

    def parse(self, response):
        # Each joke sits in its own div under the list container
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # extract_first() returns None instead of raising IndexError,
            # which matters for anonymous posts that have no <h2> author
            author = div.xpath('.//div[@class="author clearfix"]//h2/text()').extract_first() or 'anonymous'
            author = author.strip('\n')
            # The joke text is split across several text nodes; join them
            content = div.xpath('.//div[@class="content"]/span//text()').extract()
            content = ''.join(content)
            item = QiubaiproItem()
            item['auto'] = author
            item['cont'] = content
            # Hand each item over to the item pipelines
            yield item
```
- items.py:

```python
import scrapy


class QiubaiproItem(scrapy.Item):
    # Field names must match the keys the spider assigns
    auto = scrapy.Field()  # author
    cont = scrapy.Field()  # joke text
```
- pipelines.py:

```python
import pymysql


class QiubaiproPipeline(object):
    fp = None

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file
        print('Spider started...')
        self.fp = open('./qiubai.txt', 'w', encoding='utf8')

    def process_item(self, item, spider):
        author = item['auto']
        content = item['cont']
        self.fp.write(author + ':' + content + '\n')
        # Return the item so the next pipeline (mysqlPipeline) also receives it
        return item

    def close_spider(self, spider):
        # Called once when the spider closes: release the file handle
        print('Spider finished')
        self.fp.close()


class mysqlPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', port=3306, database='qiubai',
                                    user='root', password='123456', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Use a parameterized query instead of string formatting, so
            # quotes inside a joke cannot break the SQL statement
            self.cursor.execute('insert into qiubai values (%s, %s)',
                                (item['auto'], item['cont']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
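The MySQL pipeline assumes a `qiubai` database containing a two-column table of the same name. A one-off setup sketch follows; the column names and types are assumptions, since the original insert statement only implies two string columns:

```python
import pymysql

# One-off setup script; credentials match the ones hard-coded in mysqlPipeline
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='123456', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute('CREATE DATABASE IF NOT EXISTS qiubai CHARACTER SET utf8')
        cursor.execute('USE qiubai')
        # Two columns, to match "insert into qiubai values (%s, %s)"
        cursor.execute(
            'CREATE TABLE IF NOT EXISTS qiubai ('
            'author VARCHAR(100), '  # assumed column name and length
            'content TEXT'           # jokes can be long
            ')'
        )
    conn.commit()
finally:
    conn.close()
```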
- settings.py:
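For either pipeline to run, it must be registered in `settings.py`. A minimal sketch of the relevant entries, assuming the default project layout; the priority numbers (lower runs first) and the UA string are illustrative choices, not from the original:

```python
# settings.py (excerpt)

# Masquerade as a normal browser; the default Scrapy UA is often blocked
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'

# Ignore robots.txt for this exercise
ROBOTSTXT_OBEY = False

# Only print errors so the crawl output stays readable
LOG_LEVEL = 'ERROR'

# Enable both pipelines; the file pipeline runs first because its number
# is lower, and its process_item must return the item so mysqlPipeline gets it
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    'qiubaiPro.pipelines.mysqlPipeline': 301,
}
```

With this in place, start the crawl from inside the project directory with `scrapy crawl qiubai`.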