scrapy爬虫框架
目录
1、流程
(1)创建项目
(2)自动打开浏览器页面
(3)表单登录
(4)滑块验证
(5)获取cookie
(6)爬取数据
(7)保存json文件
2、整体框架
1、流程
(1)创建项目
d:
cd d:\scrapyProject
scrapy startproject xxx_yyy
cd d:\scrapyProject\xxx_yyy
scrapy genspider hot https://www.xxx
(2)自动打开浏览器页面
class HotSalesSpider(Spider):
    """Spider that opens the Qidian hot-sales ranking page for login/scraping.

    NOTE(review): the browser is launched and the page is fetched at class
    *definition* time (class-level statements), not per-crawl — presumably the
    tutorial's intent so the login steps below can run interactively; confirm.
    """
    # Spider name, used as `scrapy crawl hot`.
    name = 'hot'
    current_page = 1  # current page counter, starts at 1
    # Raw string for the Windows path: the original "C:\chromedriver.exe"
    # only worked because "\c" happens not to be an escape sequence.
    driver = webdriver.Chrome(r"C:\chromedriver.exe")
    # Login entry point.
    url = "https://www.qidian.com/rank/hotsales?style=1"
    driver.get(url)
    time.sleep(1)
(3)表单登录
# Switch to the "password login" tab before touching the form.
# NOTE(review): find_element_by_xpath was removed in Selenium 4; this snippet
# assumes Selenium 3 — confirm the installed version.
driver.find_element_by_xpath('//*[@id="normalLogin"]').click()
time.sleep(1)
# Locate the phone-number and password inputs.
phone_field = driver.find_element_by_xpath('//*[@id="nameNormal"]')
password_field = driver.find_element_by_xpath('//*[@id="pwdNormal"]')
# Type the credentials into the form.
phone_field.send_keys('133')
password_field.send_keys('pwd')
(4)滑块验证
# Grab the slider handle of the drag-to-verify captcha.
slider = driver.find_element_by_xpath('//*[@id="nc_1_n1z"]')
# Press the mouse on the handle, drag it right, release, then run the
# whole sequence. ActionChains methods return the chain, so this fluent
# form queues exactly the same actions as building them one statement
# at a time. The 500 px offset was found by trial and error.
ActionChains(driver) \
    .click_and_hold(slider) \
    .move_by_offset(500, 0) \
    .release() \
    .perform()
(5)获取cookie
class qidianSpider(Spider):
name = 'bookshelf' # 爬虫名称
#获取cookie
def __init__(self):
    """Harvest the Qidian cookies from the local Chrome profile.

    Reads every cookie Chrome has on disk via browsercookie and keeps
    only the ".qidian.com" ones the authenticated session needs, stored
    as a plain name->value dict in ``self.cookie_dict``.
    """
    # The cookie names required for a logged-in Qidian session.
    wanted = {"_csrfToken", "e1", "e2", "newstatisticUUID", "ywguid", "ywkey"}
    self.cookie_dict = {
        c.name: c.value
        for c in browsercookie.chrome()
        if c.domain == ".qidian.com" and c.name in wanted
    }
# 初始请求函数
def start_requests(self):
url = "https://my.qidian.com/bookcase" # 初始网址
yield Request(url