Crawling news with the Scrapy framework and storing it in MySQL

1. Command prompt (cmd)

scrapy startproject yaowen

cd yaowen 

scrapy genspider yw www.gov.cn 
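These three commands scaffold the project; the resulting layout is the standard Scrapy skeleton:

yaowen/
    scrapy.cfg
    yaowen/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            yw.py        # generated by genspider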

2. items.py

import scrapy

class YaowenItem(scrapy.Item):
    title = scrapy.Field()    # headline text
    date = scrapy.Field()     # publication date
    url = scrapy.Field()      # full link to the article
    neirong = scrapy.Field()  # summary/content text
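Once instantiated, the item behaves like a dict, which is exactly how the spider and the pipeline use it below; a quick illustration (the headline value is made up):

item = YaowenItem()
item['title'] = 'example headline'
print(dict(item))  # {'title': 'example headline'}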

3. yw.py

import scrapy
import requests
from yaowen.items import YaowenItem

class YwSpider(scrapy.Spider):
    name = 'yw'
    allowed_domains = ['www.gov.cn']
    start_urls = ['http://www.gov.cn/xinwen/']

    def parse(self, response):
        total = response.xpath('//div[@class="zl_channel_body zl_channel_bodyxw"]/dl')
        for b in total:
            item = YaowenItem()
            # extract() already returns lists of str, so the pieces can be joined directly
            title = ' '.join(b.xpath('./dd/h4/a/text()').extract())
            date = ' '.join(b.xpath('./dd/h4/span/text()').extract())
            new_url = ''.join(b.xpath('./dd/h4/a/@href').extract())
            neirong = ' '.join(b.xpath('./dd/p/text()').extract())

            item['title'] = title
            item['date'] = date
            # resolve the (possibly relative) href against the page URL
            item['url'] = response.urljoin(new_url)
            item['neirong'] = neirong

            yield item

    # helper for fetching a detail page with requests; note that it is not
    # called anywhere in this spider
    def get_content(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
        }
        cont = requests.get(url, headers=header)
        content = cont.content.decode("gb2312", errors='ignore')
        return content
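Since get_content is never wired into the crawl, the idiomatic Scrapy alternative is to yield a Request for each article and fill in the item from a callback. A minimal sketch, assuming Scrapy 1.7+ for cb_kwargs; parse_detail and its XPath are hypothetical, not part of the original spider:

    # in parse(), instead of "yield item":
    yield scrapy.Request(item['url'], callback=self.parse_detail, cb_kwargs={'item': item})

    def parse_detail(self, response, item):
        # hypothetical XPath for the article body; adjust to the real page template
        item['neirong'] = ' '.join(response.xpath('//div[@class="pages_content"]//p/text()').extract())
        yield item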

4. pipelines.py (storing into MySQL)

from itemadapter import ItemAdapter
import pymysql

# store items into MySQL
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456', database='shiyanzuoye', port=3306, charset='utf8')
        self.cursor = self.conn.cursor()  # cursor object

    def process_item(self, item, spider):
        # parameterized query: avoids SQL injection and broken quoting
        self.cursor.execute(
            'insert into zuoyeTable(title, date, url, neirong) values (%s, %s, %s, %s)',
            (item['title'], item['date'], item['url'], item['neirong']))
        self.conn.commit()
        return item  # hand the item on to any later pipeline

    def close_spider(self, spider):
        self.conn.close()
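The pipeline assumes a zuoyeTable table already exists in the shiyanzuoye database. A one-off setup script along these lines would create it (the column types are assumptions; the post does not show the actual schema):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456', database='shiyanzuoye', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS zuoyeTable (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(64),
            url VARCHAR(512),
            neirong TEXT
        ) DEFAULT CHARSET = utf8
    ''')
conn.close()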

5. settings.py

BOT_NAME = 'yaowen'

SPIDER_MODULES = ['yaowen.spiders']
NEWSPIDER_MODULE = 'yaowen.spiders'


COOKIES_ENABLED = False



ITEM_PIPELINES = {
   'yaowen.pipelines.MysqlPipeline': 300,
   # enable this entry only once a MongodbPipeline actually exists in pipelines.py (see step 8)
   # 'yaowen.pipelines.MongodbPipeline': 400,
}
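Two more settings are worth checking for a crawl like this (standard Scrapy options, not part of the original post; the values are suggestions):

ROBOTSTXT_OBEY = False  # the project template defaults this to True; check whether robots.txt permits the crawl
DOWNLOAD_DELAY = 1      # throttle to roughly one request per second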

6. start.py (launcher; create a new start.py in the yaowen project directory)

from scrapy import cmdline

def main():
    cmdline.execute(["scrapy", "crawl", "yw"])

if __name__ == '__main__':
    main()
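Running python start.py from the directory that contains scrapy.cfg is equivalent to running scrapy crawl yw on the command line, but is handier for launching from an IDE.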

7. Results
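A quick way to inspect what landed in the table, reusing the connection parameters from the pipeline:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456', database='shiyanzuoye', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('SELECT title, date, url FROM zuoyeTable LIMIT 5')
    for row in cursor.fetchall():
        print(row)
conn.close()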

8. Storing the data in MongoDB, and paginated crawling, will be presented in a separate post using 论文发表网 (a paper-publishing site) as the example.
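Since settings.py above already references yaowen.pipelines.MongodbPipeline, here is a minimal sketch of what such a pipeline could look like with pymongo (the database and collection names are assumptions):

import pymongo

class MongodbPipeline(object):
    def __init__(self):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.collection = self.client['shiyanzuoye']['zuoyeTable']  # assumed names

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()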
