1. Command prompt (cmd)
scrapy startproject yaowen
cd yaowen
scrapy genspider yw www.gov.cn
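These three commands create the project and generate a spider skeleton named yw restricted to www.gov.cn. The resulting layout is the standard Scrapy template, roughly:

yaowen/
    scrapy.cfg
    yaowen/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            yw.py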
2. items.py
import scrapy

class YaowenItem(scrapy.Item):
    # one news entry: headline, publish date, article URL, summary text
    title = scrapy.Field()
    date = scrapy.Field()
    url = scrapy.Field()
    neirong = scrapy.Field()
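A scrapy.Item behaves like a dict with a fixed key set: only declared fields can be assigned, and an undeclared key raises KeyError. A quick illustration:

item = YaowenItem()
item['title'] = '国务院要闻'   # OK: declared field
item['author'] = 'x'           # KeyError: 'author' is not a declared field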
3. yw.py (the spider)
import scrapy
import requests
from yaowen.items import YaowenItem
from urllib import parse

class YwSpider(scrapy.Spider):
    name = 'yw'
    allowed_domains = ["www.gov.cn"]
    start_urls = ['http://www.gov.cn/xinwen/']

    def parse(self, response):
        total = response.xpath('//div[@class="zl_channel_body zl_channel_bodyxw"]/dl')
        for b in total:
            item = YaowenItem()
            # extract() already returns a list of strings, so join directly
            title = ' '.join(b.xpath('./dd/h4/a/text()').extract())
            date = ' '.join(b.xpath('./dd/h4/span/text()').extract())
            neirong = ' '.join(b.xpath('./dd/p/text()').extract())
            new_url = ''.join(b.xpath('./dd/h4/a/@href').extract())
            # resolve the (possibly relative) href against the site root
            new_full_url = parse.urljoin('http://www.gov.cn', new_url)
            item['title'] = title
            item['date'] = date
            item['url'] = new_full_url
            item['neirong'] = neirong
            yield item

    # helper that fetches a page outside the Scrapy scheduler (not called above)
    def get_content(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
        }
        cont = requests.get(url, headers=header)
        content = cont.content.decode("gb2312", errors='ignore')
        return content
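Because get_content issues a blocking requests call, it bypasses Scrapy's scheduler (and is never actually invoked above). If the full article body is wanted, the more idiomatic route is to chain a second request: replace `yield item` in parse with a scrapy.Request and finish the item in a callback. A minimal sketch, where parse_detail is a hypothetical method and the article selector (div#UCAP-CONTENT) is an assumption, not verified against the real pages:

# inside parse(), instead of `yield item`:
yield scrapy.Request(new_full_url, callback=self.parse_detail, meta={'item': item})

# new method on YwSpider:
def parse_detail(self, response):
    item = response.meta['item']
    # selector is an assumption; adjust to the actual article template
    item['neirong'] = ' '.join(response.xpath('//div[@id="UCAP-CONTENT"]//text()').extract())
    yield item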
4. pipelines.py (writing to MySQL)
import pymysql

# write each item into MySQL
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456',
                                    database='shiyanzuoye', port=3306, charset='utf8')
        self.cursor = self.conn.cursor()  # cursor object
    def process_item(self, item, spider):
        # parameterized query avoids quoting problems and SQL injection
        self.cursor.execute(
            'INSERT INTO zuoyeTable (title, date, url, neirong) VALUES (%s, %s, %s, %s)',
            (item['title'], item['date'], item['url'], item['neirong']))
        self.conn.commit()
        return item  # return the item so later pipelines can see it
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
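process_item assumes the zuoyeTable table already exists in the shiyanzuoye database. A one-off helper to create it might look like the sketch below; the column types and sizes are assumptions (date is stored as text because the page provides a display string, not a DATE value):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456',
                       database='shiyanzuoye', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS zuoyeTable (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(64),
            url VARCHAR(512),
            neirong TEXT
        ) CHARACTER SET utf8
    ''')
conn.commit()
conn.close()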
5. settings.py
BOT_NAME = 'yaowen'
SPIDER_MODULES = ['yaowen.spiders']
NEWSPIDER_MODULE = 'yaowen.spiders'
COOKIES_ENABLED = False
ITEM_PIPELINES = {
    'yaowen.pipelines.MysqlPipeline': 300,
    # enable once the MongoDB pipeline from section 8 is added:
    # 'yaowen.pipelines.MongodbPipeline': 400,
}
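The User-Agent hardcoded in get_content can instead be set project-wide here, so the requests Scrapy itself schedules carry it too; a minimal addition:

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'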
6. start.py (launcher script; create start.py in the yaowen directory)
from scrapy import cmdline

def main():
    # equivalent to running `scrapy crawl yw` in the project directory
    cmdline.execute(["scrapy", "crawl", "yw"])

if __name__ == '__main__':
    main()
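Alternatively, run the spider straight from the command line and dump the items to a file for a quick check (a standard Scrapy option):

scrapy crawl yw -o yaowen.json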
7. Results
8. Storing the data in MongoDB, and the page-by-page crawling it involves, will be demonstrated in a follow-up using 论文发表网 (a paper-publishing site) as the example.
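As a preview, a minimal MongodbPipeline sketch using pymongo that would pair with the commented-out entry in settings.py; the connection string, database, and collection names are assumptions:

import pymongo

class MongodbPipeline(object):
    def __init__(self):
        # local MongoDB on the default port; names are assumptions
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        self.collection = self.client['shiyanzuoye']['zuoyeTable']
    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item
    def close_spider(self, spider):
        self.client.close()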