import scrapy
from lxml import etree
from hw_project.items import NewsItem
class WangyiSpider(scrapy.Spider):
    """Scrape the Netease (news.163.com) whole-site news click ranking page.

    Parses the ranking tables with lxml directly (rather than Scrapy
    selectors) and yields one ``NewsItem`` per table row.
    """

    name = 'wangyi'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/special/0001386F/rank_whole.html']

    def parse(self, response):
        """Yield a ``NewsItem`` (title, num_click) for every ranked row.

        Rows whose cell layout does not match the expected structure are
        skipped instead of aborting the crawl.
        """
        html_ele = etree.HTML(response.text)
        # [1:3] / [1:4] select the ranking sections/tables of this specific
        # page layout — presumably skipping header/decoration divs; verify
        # against the live page if the site markup changes.
        for div_total in html_ele.xpath('/html/body/div[4]/div')[1:3]:
            for div_table in div_total.xpath('./div/div')[1:4]:
                # [1:] skips each table's header row.
                for tr in div_table.xpath('./table/tr')[1:]:
                    try:
                        # [0] raises IndexError when the row lacks the
                        # expected cells — that is the only failure we
                        # tolerate (was a bare `except:` that hid all
                        # errors, including KeyboardInterrupt).
                        title = tr.xpath('./td[1]/a')[0].text
                        num_click = tr.xpath('./td[2]')[0].text
                    except IndexError:
                        continue
                    item = NewsItem()
                    item['title'] = title
                    item['num_click'] = num_click
                    yield item
class NewsItem(scrapy.Item):
    """Container for one scraped news entry from the ranking page.

    NOTE(review): a class of the same name is also imported from
    ``hw_project.items`` at the top of this file — this local definition
    shadows it from this point on; confirm which one is intended.
    """

    # Headline text of the article link (may be None if the cell was empty).
    title = scrapy.Field()
    # Click-count string as shown in the ranking table (not converted to int).
    num_click = scrapy.Field()
class NewsPipeline(object):
    """Persist scraped news items into the ``data_news`` MySQL table."""

    def __init__(self):
        # MysqlHelper is expected to be in scope (imported elsewhere in the
        # project); it owns the connection details.
        self.mysql_object = MysqlHelper()

    def process_item(self, item, spider):
        """Insert the item's title and click count, then pass the item on.

        Uses a parameterized query, so values are escaped by the driver.
        Returns the item: Scrapy requires pipelines to return the item (or
        raise DropItem) so later pipelines receive it — the original
        implicitly returned None, which broke any downstream pipeline.
        """
        sql = 'insert into data_news (title, num_click) values (%s, %s)'
        data = (item['title'], item['num_click'])
        self.mysql_object.execute_modify_sql(sql, data)
        return item
if __name__ == '__main__':
    # `execute` was an undefined name in the original; it comes from
    # scrapy.cmdline (scrapy is already a dependency of this file).
    # The __main__ guard prevents the crawl from launching on import.
    from scrapy.cmdline import execute
    execute('scrapy crawl wangyi'.split())