Exercise: NetEase News Ranking

A small Scrapy exercise: crawl the NetEase news ranking page, extract each article's title and click count, and store them in MySQL.

The spider (typically spiders/wangyi.py). The page is fetched by Scrapy and then parsed with lxml; the XPath indexes below are tied to the layout of the ranking page, where the ranking sections and their tables sit in nested divs.

# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
from hw_project.items import NewsItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/special/0001386F/rank_whole.html']

    def parse(self, response):
        html_ele = etree.HTML(response.text)
        # The ranking sections are the 2nd and 3rd divs at this level.
        div_total_list = html_ele.xpath('/html/body/div[4]/div')[1:3]
        for div_total in div_total_list:
            # Each section contains three ranking tables.
            div_table_list = div_total.xpath('./div/div')[1:4]
            for div_table in div_table_list:
                # Skip the header row of each table.
                tr_list = div_table.xpath('./table/tr')[1:]
                for tr in tr_list:
                    try:
                        title = tr.xpath('./td[1]/a')[0].text
                        num_click = tr.xpath('./td[2]')[0].text
                    except IndexError:
                        # Rows missing a link or count cell are skipped.
                        continue
                    item = NewsItem()
                    item['title'] = title
                    item['num_click'] = num_click
                    yield item
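As a design note, the lxml round-trip is optional: Scrapy responses expose the same XPath API through built-in selectors. A sketch of an equivalent parse method, with the Python slices rewritten as position() predicates:

    def parse(self, response):
        # Equivalent extraction with Scrapy's built-in selectors, no lxml.
        rows = response.xpath(
            '/html/body/div[4]/div[position()>=2 and position()<=3]'
            '/div/div[position()>=2 and position()<=4]'
            '/table/tr[position()>1]'
        )
        for tr in rows:
            title = tr.xpath('./td[1]/a/text()').get()
            num_click = tr.xpath('./td[2]/text()').get()
            if title and num_click:
                item = NewsItem()
                item['title'] = title
                item['num_click'] = num_click
                yield item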
items.py: the item carries the two fields the spider extracts.

import scrapy


class NewsItem(scrapy.Item):
    title = scrapy.Field()
    num_click = scrapy.Field()
pipelines.py: every item is written straight to MySQL through a small helper class. MysqlHelper is not shown in the original post; adjust the import to wherever it is defined in your project, and see the sketch after this block.

from mysql_helper import MysqlHelper


class NewsPipeline(object):
    def __init__(self):
        self.mysql_object = MysqlHelper()

    def process_item(self, item, spider):
        sql = 'insert into data_news (title, num_click) values (%s, %s)'
        data = (item['title'], item['num_click'])
        self.mysql_object.execute_modify_sql(sql, data)
        # Return the item so any later pipelines still receive it.
        return item
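Since the helper itself is not in the post, here is a minimal sketch of what it presumably looks like, assuming pymysql. The connection parameters and the data_news table definition in the comment are placeholders; execute_modify_sql matches the call made in the pipeline.

# mysql_helper.py: minimal sketch, assuming pymysql.
# Expects a table like:
#   CREATE TABLE data_news (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       title VARCHAR(255),
#       num_click VARCHAR(32)
#   );
import pymysql


class MysqlHelper(object):
    def __init__(self):
        # Placeholder connection parameters; replace with your own.
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='your_password',
            database='your_db',
            charset='utf8mb4',
        )

    def execute_modify_sql(self, sql, data):
        # Run an INSERT/UPDATE/DELETE with parameters and commit.
        with self.conn.cursor() as cursor:
            cursor.execute(sql, data)
        self.conn.commit()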
A start script so the crawl can be launched from the IDE:

from scrapy.cmdline import execute

execute('scrapy crawl wangyi'.split())
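One more wiring step: the pipeline only runs once it is registered in settings.py. The module path below is an assumption based on the hw_project package used in the spider's import.

# settings.py (excerpt): enable the pipeline.
ITEM_PIPELINES = {
    'hw_project.pipelines.NewsPipeline': 300,
}
# Depending on the site's robots.txt, ROBOTSTXT_OBEY = False may also be
# needed for the requests to go through.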