Scrapy is a web crawling framework.
To install Scrapy, using Anaconda is recommended.
Anaconda installation guide: http://www.scrapyd.cn/doc/124.html
After installing, configure the Tsinghua mirror.
In the Anaconda Prompt, enter:
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
conda config --set show_channel_urls yes
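After these three commands, conda records the mirrors in the user-level .condarc file. A sketch of the expected result (since --add prepends channels, the order reflects the commands above):

channels:
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
  - defaults
show_channel_urls: true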
Then, in a cmd window, enter
conda install scrapy
When prompted, enter y to start the download. Once it finishes,
enter scrapy to check whether the installation succeeded.
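A successful install prints a version banner and the list of available subcommands, roughly like the following (the exact version number depends on what conda resolved):

Scrapy 1.6.0 - no active project

Usage:
  scrapy <command> [options] [args]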
Install the selenium module:
conda install selenium
Install the xlutils module:
conda install xlutils
Download chromedriver:
http://npm.taobao.org/mirrors/chromedriver/
Download the chromedriver build that matches the version of Google Chrome installed on your machine.
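A quick way to confirm the driver and browser match is to launch Chrome through it once. A minimal smoke-test sketch, using the same selenium 3-style API as the middleware later in this article; the executable_path is an assumed location, point it at wherever you saved chromedriver:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')  # no visible browser window
# executable_path is an assumption; use your own chromedriver location
driver = webdriver.Chrome(chrome_options=opts, executable_path='D:\\Anaconda3\\Scripts\\chromedriver.exe')
driver.get('https://www.youzy.cn')
print(driver.title)  # a version mismatch raises SessionNotCreatedException before this line
driver.quit()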
Create a Scrapy project:
scrapy startproject mingyan
mingyan is the project name.
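The command generates the standard Scrapy layout:

mingyan/
    scrapy.cfg          # deploy configuration
    mingyan/            # the project's Python package
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/        # spiders live here
            __init__.py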
Under mingyan/spiders, create a new file index.py to hold the spider:
import scrapy
import xlwt

# Global state shared across callbacks
row = 0  # current row in the sheet
book = xlwt.Workbook()  # create a new Excel workbook
sheet = book.add_sheet('全国')  # add a sheet (named "nationwide")
# Header row: university, image URL, province, university type,
# ownership, governing body, tags, detail-page URL
sheet.write(row, 0, "大学")
sheet.write(row, 1, "图片地址")
sheet.write(row, 2, "省份")
sheet.write(row, 3, "大学类型")
sheet.write(row, 4, "办学性质")
sheet.write(row, 5, "所属单位")
sheet.write(row, 6, "标签")
sheet.write(row, 7, "大学详情网址")

class yzy(scrapy.Spider):  # must subclass scrapy.Spider

    name = "yzy"  # spider name

    # start_urls = [  # alternative style: no start_requests method needed
    #     'http://lab.scrapyd.cn/page/1/',
    # ]

    def start_requests(self):  # generate the requests to crawl
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"}
        # set cookies
        cookies = {
            'connect.sid': 's%3Ao5B5Jt2IE07URCM1yHnF7_qjEsx38tVP.CZ%2Fy0YUUNvuYoh0VaIUAusX8pCH8O%2BQZXC%2FUQ6O7OBw',
            'UM_distinctid': '16c26c375e62c7-04af75133362ad-3e385f05-100200-16c26c375e7de',
            'CNZZDATA1254568697': '902292085-1564015095-https%253A%252F%252Fwww.youzy.cn%252F%7C1564015095',
            'Youzy2CCurrentProvince': '%7B%22provinceId%22%3A%22849%22%2C%22provinceName%22%3A%22%E6%B9%96%E5%8C%97%22%2C%22isGaokaoVersion%22%3Atrue%7D'
        }

        # url = 'https://www.youzy.cn/tzy/search/colleges/collegeList?page=1'
        # yield scrapy.Request(url, headers=headers, cookies=cookies, callback=self.parse)
        # The list is known to have 144 pages; range(1, 145) yields 1..144
        for i in range(1, 145):
            url = 'https://www.youzy.cn/tzy/search/colleges/collegeList?page=' + str(i)
            yield scrapy.Request(url, headers=headers, cookies=cookies, callback=self.parse)

    def parse(self, response):  # Scrapy calls this with the downloaded page as response
        # declare globals (row is reassigned below)
        global row
        global sheet
        global book
        li_box = response.css('.uzy-college-list li.clearfix')
        for item in li_box:  # loop over the result set
            imgSrc = item.css('.mark img::attr(src)').extract()[0]
            collegeName = item.css('.info .top a::text').extract()[0]
            classify = item.css(".info .bottom .quarter_1::text").extract()[0]
            collegeType = item.css(".info .bottom .quarter_2::text").extract()[0]
            industry = item.css(".info .bottom .quarter::text").extract()[0]
            # The location may be missing, which would make a fixed index run
            # past the end of the list, so take the last item instead
            province = item.css(".info .bottom .quarter::text").extract()[-1]
            tags = item.css(".info .bottom .college-types-txt::text").extract()
            tags = ",".join(tags)

            infoUrl = item.css('.mark a::attr(href)').extract()[0]
            infoUrl = "https://www.youzy.cn" + infoUrl

            # write one row to the sheet
            row += 1
            sheet.write(row, 0, collegeName)
            sheet.write(row, 1, imgSrc)
            sheet.write(row, 2, province.strip())
            sheet.write(row, 3, classify)
            sheet.write(row, 4, collegeType.strip())
            sheet.write(row, 5, industry)
            sheet.write(row, 6, tags)
            sheet.write(row, 7, infoUrl)
        book.save('yzy.xls')  # rewrite the file after every page
        # Following the "next" link instead: not used here, because chaining
        # pages one after another makes the crawl far slower than issuing all
        # requests up front
        # next_page = response.css('li.next a::attr(href)').extract_first()
        # if next_page is not None:
        #     print(next_page)
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
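With index.py saved, run the spider from the project root; the argument must match the spider's name attribute, and yzy.xls is written to the directory you run from:

scrapy crawl yzy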
Scrapy settings
Using chromedriver
The downloader middleware, myMiddleware.py (enabling it in settings.py is shown after the code):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.http import HtmlResponse
import time

chrome_options = Options()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
# One shared browser instance for the whole crawl
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path='D:\\Anaconda3\\Scripts\\chromedriver.exe')

class javaScriptMiddleware(object):

    def process_request(self, request, spider):
        global driver
        if spider.name == 'yzy_majorinfo_bk':
            # driver = webdriver.Chrome('D:\\Anaconda3\\Scripts\\chromedriver.exe')
            driver.get(request.url)
            print(request.cookies)
            # Injecting the request's cookies into the browser:
            # for key in request.cookies:
            #     driver.add_cookie({'name': key, 'value': request.cookies[key]})
            # driver.refresh()
            # time.sleep(1)
            # js = "var q=document.documentElement.scrollTop=10000"
            # driver.execute_script(js)  # run JS to mimic a user scrolling to the bottom of the page
            time.sleep(2)  # give the page time to render its JavaScript
            body = driver.page_source
            print("Fetching " + request.url)
            return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
        elif spider.name == 'yzy_majorinfo_zk':
            driver.get(request.url)
            time.sleep(2)
            body = driver.page_source
            print("Fetching " + request.url)
            return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
        else:
            # let other spiders' requests go through Scrapy's normal downloader
            return None
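For Scrapy to actually route downloads through this middleware, it has to be registered in the project's settings.py. A minimal sketch, assuming myMiddleware.py sits directly inside the mingyan package (the module path and the 543 priority are assumptions; adjust them to your layout):

# settings.py (excerpt)
DOWNLOADER_MIDDLEWARES = {
    'mingyan.myMiddleware.javaScriptMiddleware': 543,  # assumed module path
}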
Full source code: Gitee (码云) address