# -*- coding: utf-8 -*-
import scrapy
import pymysql
import datetime
import re
from scrapy.selector import Selector
class JiaoyizheSpider(scrapy.Spider):
    """Crawl the ``forum-16`` thread listing of jiaoyizhe.com page by page.

    ``parse`` handles one listing page: it schedules a request for every
    thread row that has a link (handled by ``paper_item``) and then schedules
    the next listing page until the advertised page count is reached.
    """

    name = 'jiaoyizhe'
    start_urls = ['http://www.jiaoyizhe.com/forum-16-1.html']

    # Reference dates used to replace the relative markers "昨天" (yesterday)
    # and "前天" (the day before yesterday) found in thread timestamps.
    # They are evaluated once, when the class body is executed.
    today = datetime.date.today()
    oneday = datetime.timedelta(days=1)
    twoday = datetime.timedelta(days=2)
    yesterday = today - oneday
    before_yesterday = today - twoday

    # Number of the listing page that was scheduled most recently.
    page = 1

    def parse(self, response):
        """Yield one request per thread on a listing page, then the next page.

        Each thread request carries ``title``, ``title_link``, ``click``,
        ``reply`` and ``time`` in ``meta``. A field whose XPath matched nothing
        keeps the empty list returned by ``extract()``, exactly as before.
        Rows without a thread link are skipped.
        """
        papers = response.xpath(r"//table[@id='threadlisttableid']//tbody")

        # The page count is read from the pagination label. If the label is
        # missing or holds no number, treat the current page as the last one
        # instead of raising IndexError before any thread has been yielded.
        page_label = response.xpath(
            r'//span[@id="fd_page_top"]//label/span/text()').extract()
        page_numbers = re.findall(r'\d+', page_label[0]) if page_label else []
        all_page = int(page_numbers[0]) if page_numbers else self.page

        for paper in papers:
            title = paper.xpath(r"tr/th/a[2]/text()").extract()
            title_link = paper.xpath(r"tr/th/a[2]/@href").extract()
            click = paper.xpath(r"tr/td[3]/a/text()").extract()
            reply = paper.xpath(r"tr/td[3]/em/text()").extract()
            posted = paper.xpath(r"tr/td[2]//span/text()").extract()

            # A row without a link cannot be followed.
            if not title_link:
                continue
            # Resolve relative hrefs against the page URL; scrapy.Request
            # rejects URLs that have no scheme.
            title_link = response.urljoin(title_link[0])

            if title:
                title = title[0]
            if click:
                click = click[0]
            if reply:
                reply = reply[0]
            if posted:
                posted = posted[0].replace(u'\xa0 ', u' ')
                # Only "yesterday" and "the day before yesterday" need to be
                # resolved: when the text contains one of these words, replace
                # it with the matching date computed from today's date.
                relative = re.findall(r"昨天|前天", posted, re.S)
                if relative:
                    if relative[0] == "昨天":
                        posted = self.yesterday
                    elif relative[0] == "前天":
                        posted = self.before_yesterday

            # Author's plan (translated from the original note): insert the
            # rows into the database, store each day's data, then segment each
            # day's text into words and count how often each word occurs;
            # also collect statistics for Taoguba.
            yield scrapy.Request(
                url=title_link,
                callback=self.paper_item,
                meta={
                    'title': title,
                    'title_link': title_link,
                    'click': click,
                    'reply': reply,
                    'time': posted,
                },
            )

        # Strictly less-than: with "<=" the last listing page would still
        # schedule page ``all_page + 1``, which does not exist.
        if self.page < all_page:
            self.page += 1
            url = 'http://www.jiaoyizhe.com/forum-16-' + str(self.page) + '.html'
            yield scrapy.Request(url=url, callback=self.parse)

    def paper_item(self, response):
        """Read the thread body text and the listing fields passed via ``meta``.

        NOTE(review): in the visible code the collected values are neither
        returned, yielded nor written anywhere - confirm whether the
        persistence step is missing from this file.
        """
        text_paper = response.xpath(
            r'//div[@class="t_fsz"]//td//text()').extract()
        # Only the first matched text node is kept; an empty result stays a list.
        if text_paper:
            text_paper = text_paper[0]
        title = response.meta['title']
        title_link = response.meta['title_link']
        click = response.meta['click']
        reply = response.meta['reply']
        time = response.meta['time']
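# Scrapes the Toujidao (jiaoyizhe.com) futures forum and writes the results to a database.
# Web-page residue: "Latest recommended article, published 2024-01-05 14:35:48".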