1. 创建一个 Scrapy 项目
# 在 cmd 中依次输入以下命令:
#scrapy startproject news
#cd news
#scrapy genspider -t crawl news163 news.163.com
2. 在 items.py 文件里定义要爬取的字段
import scrapy
class NewsItem(scrapy.Item):
    """Container for the data scraped from a single news page.

    Each attribute is declared as a bare ``scrapy.Field()`` with no
    metadata. The per-field notes below are inferred from the field
    names alone; confirm them against the spider that fills the item.
    """

    # presumably the thread/article identifier -- TODO confirm
    news_thread = scrapy.Field()
    # presumably the article headline -- TODO confirm
    news_title = scrapy.Field()
    # presumably the publication time -- TODO confirm format
    news_time = scrapy.Field()
    # presumably the name of the originating outlet -- TODO confirm
    news_source = scrapy.Field()
    # presumably the URL of the originating outlet -- TODO confirm
    source_url = scrapy.Field()
    # presumably the article body text -- TODO confirm
    news_text = scrapy.Field()
    # presumably the URL of the scraped page -- TODO confirm
    news_url = scrapy.Field()
3. 分析页面源代码并编写 news163.py 文件
#导入需要的第三方库
import scrapy
from news.items import NewsItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule
#编写正则表达式
#https://news.163.com/20