Scrapy Crawler Example: Southern Metropolis Daily (南方都市报)

1. Directory Structure

Under the project directory (the __pycache__ cache directory is left out here; it is roughly analogous to Java's compiled .class files), the structure is as follows:
└─ SouthCity                   # project name
    │
    │  scrapy.cfg              # Scrapy deployment configuration (generated automatically when the project is created)
    │
    └─ SouthCity               # the project's Python package
        │  items.py            # Items module
        │  middlewares.py      # Middlewares module
        │  pipelines.py        # Pipelines module
        │  settings.py         # Scrapy settings (where the custom components are configured)
        │  __init__.py
        │
        └─ spiders             # spider module directory
           │  mpage.py         # spider code
           └─ __init__.py

2. Crawler Overview

The crawler has three main parts:
Custom item (items.py): the Scrapy data structure that holds one article
Crawling part (mpage.py): parses the pages, extracts links, and fills the item
Storage part (pipelines.py): takes the data out of the item and writes it to the database
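
With these three pieces in place the crawl is started with the scrapy crawl mpage command. As a minimal sketch (assuming it is run from the project root so that the project settings can be found), the same crawl can also be launched from a plain Python script:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from SouthCity.spiders.mpage import MpageSpider

process = CrawlerProcess(get_project_settings())  # loads settings.py, including ITEM_PIPELINES
process.crawl(MpageSpider)
process.start()  # blocks until the crawl finishes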

3. Custom Item

# -*- coding: utf-8 -*-
import scrapy

# Subclass scrapy.Item to define our own data structure.
class ArticalItem(scrapy.Item):
    leading_title = scrapy.Field()  # leading (kicker) headline
    title = scrapy.Field()          # headline
    subtitle = scrapy.Field()       # subtitle
    link = scrapy.Field()           # article URL
    source = scrapy.Field()         # news source
    writeTime = scrapy.Field()      # publication time
    section = scrapy.Field()        # section of the paper
    author = scrapy.Field()         # author
    news = scrapy.Field()           # article body
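
A scrapy.Item behaves much like a dict that only accepts its declared fields. A quick sketch of how ArticalItem is used on its own (the headline string is just an example):

from SouthCity.items import ArticalItem

item = ArticalItem()
item['title'] = 'Example headline'   # declared fields are set like dict keys
print(item['title'])                 # -> Example headline
print(dict(item))                    # an Item converts cleanly to a plain dict
# item['foo'] = 'bar'                # an undeclared field raises KeyError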


Then register the pipeline in settings.py so that Scrapy passes every scraped item to it (the number is the pipeline's order; values range from 0 to 1000 and lower numbers run first):

ITEM_PIPELINES = {
    'SouthCity.pipelines.MySQLStoreCnblogsPipeline': 301,
}

4. Crawling Part

#   mpage.py
# Three parts in total, i.e. the three steps of the crawl:
#
# parse              gets the URL of every section of the paper
# parse_section      gets the URL of every article in the current section
# parse_page         extracts the article details and stores them in the item

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup

from SouthCity.items import ArticalItem

nav = {}


class MpageSpider(scrapy.Spider):
    name = 'mpage'
    # allowed_domains = ['epaper.oeeee.com']
    start_urls = ['http://epaper.oeeee.com/epaper']

    def parse(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        paper_div = soup.find('div', 'shortcutbox')
        a = paper_div.find_all('a')

        for i in a:
            href = i.get('href')
            link = response.urljoin(href)  # joins the relative URL automatically
            # link = 'http://epaper.oeeee.com/epaper/' + href[href.find('A'):]  # manual join of the relative URL
            nav[i.text] = link
            try:
                # yield submits one request at a time and then resumes, instead of
                # building a whole list in memory before returning it.
                yield scrapy.Request(link, callback=self.parse_section)
            except Exception:
                continue
        # print(nav)

    def parse_section(self, response):
        html = response.body
        soup = BeautifulSoup(html, 'html.parser')
        paper_div = soup.find('div', 'main-list')
        a = paper_div.find_all('a')
        nav = {}
        for i in a:
            href = i.get('href')
            link = response.urljoin(href)

            nav[i.text] = link
            try:
                yield scrapy.Request(link, callback=self.parse_page)
            except Exception:
                continue
        # print(nav)
    def parse_page(self, response):
        detailbox = []
        artical = '  '

        html = response.body
        soup = BeautifulSoup(html, 'html.parser')

        info = soup.find('div', 'main-600 fl')

        # metadata spans: source, publication time, section, author, ...
        detail = info.find_all('span')
        for dt in detail:
            try:
                dts = dt.text
                dts = dts[dts.find(':') + 1:].strip()  # drop the "label:" prefix
                detailbox.append(dts)
            except Exception:
                detailbox.append(dt.text)

        # article body: concatenate the text of every <p>
        news = info.find('div', 'text')
        pp = news.find_all('p')
        for p in pp:
            pt = p.text
            pt = pt.strip().replace('\xa0', '')
            artical += pt

        # headlines: <h1> is the main title, the <h2> tags hold the leading title and subtitle
        head1 = ''
        head2 = []
        try:
            head1 = info.find('h1').text
            head2 = info.find_all('h2')
        except Exception:
            pass

        item = ArticalItem()
        item['leading_title'] = head2[0].text if len(head2) > 0 else ''
        item['title'] = head1
        item['subtitle'] = head2[1].text if len(head2) > 1 else ''
        item['link'] = response.url
        item['writeTime'] = detailbox[1]
        item['source'] = detailbox[0]
        item['section'] = detailbox[3]
        item['author'] = detailbox[4]
        item['news'] = artical
        yield item
        # print(item)
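
BeautifulSoup does the job here, but the same link extraction can also be written with Scrapy's built-in selectors. A sketch of an alternative parse(), assuming the same div.shortcutbox markup and Scrapy 1.4 or later (which added response.follow):

    def parse(self, response):
        # response.follow joins relative URLs just like response.urljoin does
        for href in response.css('div.shortcutbox a::attr(href)').extract():
            yield response.follow(href, callback=self.parse_section)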

5. Storage Part

# pipelines.py
# Two parts:
# __init__          sets up the database connection
# process_item      stores each item using that connection


import pymysql
import logging
import datetime


class MySQLStoreCnblogsPipeline(object):
    # The connection and cursor created in __init__ are stored on self, so every
    # method of the class can use them (comparable to Java member variables).

    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            db='TESTDB',
            user='pymysql',
            passwd='123123',
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        date = str(now.date())
        date_s = date[:4] + date[5:7] + date[8:]
        print(date_s)  # date string, e.g. 20170912
        table_name = 'sc_' + date_s  # one table per day; keep the name lower-case
                                     # everywhere, since MySQL table names can be
                                     # case-sensitive on Linux
        # SQL to create the day's table
        sql = 'CREATE TABLE sc_%s (leading_title varchar(255), title varchar(255), subtitle varchar(255), link varchar(250) NOT NULL primary key, writeTime varchar(20), source varchar(100),section varchar(50),author varchar(100),news text,updated datetime,img varchar(100))' % date_s

        # SQL to check whether this link is already stored
        sql_query = "SELECT 1 from sc_%s where link = '%s'" % (date_s, item['link'])

        # SQL to update an existing row
        sql_update = """UPDATE sc_%s set leading_title = '%s' ,
                                         title = '%s', 
                                         subtitle = '%s' ,
                                         link = '%s' , 
                                         writetime = '%s' , 
                                         source = '%s' ,
                                         section = '%s' , 
                                         author = '%s' ,
                                         news = '%s' ,
                                         updated = '%s' 
                                    where link = '%s'
                    """% (date_s,
                          item['leading_title'],
                          item['title'],
                          item['subtitle'],
                          item['link'],
                          item['writeTime'],
                          item['source'],
                          item['section'],
                          item['author'],
                          item['news'],
                          now,
                          item['link'])

        # SQL to insert a new row
        sql_insert =  """
                    insert into sc_%s(leading_title, title, subtitle, link, writeTime,source,section,author,news,updated) 
                    values('%s', '%s', '%s','%s', '%s', '%s', '%s', '%s', '%s', '%s')
                    """% (date_s,
                        item['leading_title'],
                        item['title'],
                        item['subtitle'],
                        item['link'],
                        item['writeTime'],
                        item['source'],
                        item['section'],
                        item['author'],
                        item['news'],
                        now)
        print('at loc1')

        # Check whether today's table already exists.
        # (TODO, skipped for now: keep a flag so this check is not repeated for every item.)
        self.cursor.execute('show tables')
        tables = self.cursor.fetchall()

        if (table_name,) not in tables:
            try:
                # the table does not exist yet, create it
                self.cursor.execute(sql)
            except Exception as e:
                raise e
        print('at loc2')


        try:
            # Is this item['link'] already in the table?
            self.cursor.execute(sql_query)
            ret = self.cursor.fetchone()
            if ret:
                # the link exists, update the row
                self.cursor.execute(sql_update)
                print("Updated one row.")
            else:
                # item['link'] is not there yet, insert it
                self.cursor.execute(sql_insert)
                print("Inserted one row.")
            print('at loc3')
            self.connect.commit()
            # self.cursor.close()  # this would close only the cursor, not the connection
            #                      # (effect unclear; no problems were observed in practice)

        except Exception as error:
            logging.warning(error)
        return item
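
Building the SQL with % string formatting works for these pages, but a single quote in a headline or article body will break the statement, and it is open to SQL injection. Below is a sketch of the insert rewritten with pymysql's parameterized execute, plus a close_spider hook so the connection is closed when the spider finishes; only the table name still has to be formatted in, because placeholders cannot substitute identifiers. The helper name insert_item is made up for this sketch; the same pattern applies to the update and the duplicate-check query.

    def insert_item(self, table_name, item, now):
        # the values are passed separately from the SQL, so quotes in the text are safe
        sql_insert = (
            'INSERT INTO {} (leading_title, title, subtitle, link, writeTime, '
            'source, section, author, news, updated) '
            'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        ).format(table_name)  # table names cannot be bound as parameters
        self.cursor.execute(sql_insert, (
            item['leading_title'], item['title'], item['subtitle'], item['link'],
            item['writeTime'], item['source'], item['section'], item['author'],
            item['news'], now,
        ))
        self.connect.commit()

    def close_spider(self, spider):
        # Scrapy calls this once when the spider is closed
        self.cursor.close()
        self.connect.close()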


