1. Command prompt (cmd)
scrapy startproject yaowen
cd yaowen
scrapy genspider yw www.gov.cn
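These three commands create the project and generate a spider skeleton named yw restricted to www.gov.cn. The resulting layout is the standard Scrapy template, roughly:

yaowen/
    scrapy.cfg
    yaowen/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            yw.py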
2. items.py
import scrapy

class YaowenItem(scrapy.Item):
    # one news entry: headline, publish date, article URL, summary text
    title = scrapy.Field()
    date = scrapy.Field()
    url = scrapy.Field()
    neirong = scrapy.Field()
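A scrapy.Item behaves like a dict with a fixed key set: only declared fields can be assigned, and an undeclared key raises KeyError. A quick illustration:

item = YaowenItem()
item['title'] = '国务院要闻'   # OK: declared field
item['author'] = 'x'           # KeyError: 'author' is not a declared field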
3. yw.py (the spider)
import scrapy
import requests
from yaowen.items import YaowenItem
from urllib import parse

class YwSpider(scrapy.Spider):
    name = 'yw'
    allowed_domains = ["www.gov.cn"]
    start_urls = ['http://www.gov.cn/xinwen/']

    def parse(self, response):
        total = response.xpath('//div[@class="zl_channel_body zl_channel_bodyxw"]/dl')
        for b in total:
            item = YaowenItem()
            # extract() already returns a list of strings, so join directly
            title = ' '.join(b.xpath('./dd/h4/a/text()').extract())
            date = ' '.join(b.xpath('./dd/h4/span/text()').extract())
            neirong = ' '.join(b.xpath('./dd/p/text()').extract())
            new_url = ''.join(b.xpath('./dd/h4/a/@href').extract())
            # resolve the (possibly relative) href against the site root
            new_full_url = parse.urljoin('http://www.gov.cn', new_url)
            item['title'] = title
            item['date'] = date
            item['url'] = new_full_url
            item['neirong'] = neirong
            yield item

    # helper that fetches a page outside the Scrapy scheduler (not called above)
    def get_content(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
        }
        cont = requests.get(url, headers=header)
        content = cont.content.decode("gb2312", errors='ignore')
        return content
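Because get_content issues a blocking requests call, it bypasses Scrapy's scheduler (and is never actually invoked above). If the full article body is wanted, the more idiomatic route is to chain a second request: replace `yield item` in parse with a scrapy.Request and finish the item in a callback. A minimal sketch, where parse_detail is a hypothetical method and the article selector (div#UCAP-CONTENT) is an assumption, not verified against the real pages:

# inside parse(), instead of `yield item`:
yield scrapy.Request(new_full_url, callback=self.parse_detail, meta={'item': item})

# new method on YwSpider:
def parse_detail(self, response):
    item = response.meta['item']
    # selector is an assumption; adjust to the actual article template
    item['neirong'] = ' '.join(response.xpath('//div[@id="UCAP-CONTENT"]//text()').extract())
    yield item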
4. pipelines.py (writing to MySQL)
import pymysql

# write each item into MySQL
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456',
                                    database='shiyanzuoye', port=3306, charset='utf8')
        self.cursor = self.conn.cursor()  # cursor object
    def process_item(self, item, spider):
        # parameterized query avoids quoting problems and SQL injection
        self.cursor.execute(
            'INSERT INTO zuoyeTable (title, date, url, neirong) VALUES (%s, %s, %s, %s)',
            (item['title'], item['date'], item['url'], item['neirong']))
        self.conn.commit()
        return item  # return the item so later pipelines can see it
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
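process_item assumes the zuoyeTable table already exists in the shiyanzuoye database. A one-off helper to create it might look like the sketch below; the column types and sizes are assumptions (date is stored as text because the page provides a display string, not a DATE value):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='zhangrui2580456',
                       database='shiyanzuoye', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS zuoyeTable (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(64),
            url VARCHAR(512),
            neirong TEXT
        ) CHARACTER SET utf8
    ''')
conn.commit()
conn.close()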
5. settings.py
BOT_NAME = 'yaowen'
SPIDER_MODULES = ['yaowen.spiders']
NEWSPIDER_MODULE = 'yaowen.spiders'
COOKIES_ENABLED = False
ITEM_PIPELINES = {
    'yaowen.pipelines.MysqlPipeline': 300,
    # enable once the MongoDB pipeline from section 8 is added:
    # 'yaowen.pipelines.MongodbPipeline': 400,
}
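The User-Agent hardcoded in get_content can instead be set project-wide here, so the requests Scrapy itself schedules carry it too; a minimal addition:

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'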
6. start.py (launcher script; create start.py in the yaowen directory)
from scrapy import cmdline

def main():
    # equivalent to running `scrapy crawl yw` in the project directory
    cmdline.execute(["scrapy", "crawl", "yw"])

if __name__ == '__main__':
    main()
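Alternatively, run the spider straight from the command line and dump the items to a file for a quick check (a standard Scrapy option):

scrapy crawl yw -o yaowen.json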
7. Results
8. Storing the data in MongoDB, and the page-by-page crawling it involves, will be demonstrated in a follow-up using 论文发表网 (a paper-publishing site) as the example.
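As a preview, a minimal MongodbPipeline sketch using pymongo that would pair with the commented-out entry in settings.py; the connection string, database, and collection names are assumptions:

import pymongo

class MongodbPipeline(object):
    def __init__(self):
        # local MongoDB on the default port; names are assumptions
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        self.collection = self.client['shiyanzuoye']['zuoyeTable']
    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item
    def close_spider(self, spider):
        self.client.close()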