requests和scrapy请求同样的页面接口,返回的数据不一样问题

1.请求接口:

在这里插入图片描述
在这里插入图片描述

2.requests请求:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import time

import requests


class REWQ:
    """Baseline client that fetches ProducerList.jsp with plain ``requests``.

    Kept as the working reference: posting the form body with the captured
    Cookie header returns the expected (authenticated) page content.
    """

    # Captured session cookie. Only the trailing Hm_lpvt_* field varies:
    # it is a unix timestamp refreshed on every request. Previously this
    # whole string was duplicated in __init__ and test(); keep it once.
    _COOKIE_TEMPLATE = 'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; factoryLogin_rand=ADEAOQA0; salesmanLogin_rand=ADgAMgAw; SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; SESSION_ROLE_TYPE=ThpSoVRY; SESSION_NAME=jCJRdpdZ; SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; SESSION_SALESMAN_ID=ADgAMwAzADUAMB; SESSION_COMPANY_NAME=; SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; SESSION_CONTROL=AFAAQQBTAFD; SESSION_LOGIN_MESSAGE=; JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; Hm_lpvt_87ef776965ff233cab5e4054821b5e94=%s'

    def __init__(self):
        # Endpoint serving the paginated producer list; also used as Referer
        # because the page posts back to itself.
        self.url = 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp'
        # Browser headers captured from a logged-in session.
        self.headers = {
            'Host': 'www.dqzsteel.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Origin': 'http://www.dqzsteel.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': self._COOKIE_TEMPLATE % '1657870616',
        }

    def test(self, page=1):
        """POST one page of the producer list and print the raw response.

        Args:
            page: 1-based page number to request. Default keeps the old
                hard-coded first-page behavior.
        """
        # Refresh the Hm_lpvt_* cookie field with the current unix timestamp.
        self.headers['Cookie'] = self._COOKIE_TEMPLATE % int(time.time())
        # Form body: fixed ordering/paging fields, variable CURRENT_PAGE.
        req_body = (
            'ORDER_BY_FIELD=ORDER_NUMBER&ORDER_BY_DIRECTION=ASC'
            '&RECORD_COUNT=924&PAGE_COUNT=10&PAGINAL_COUNT=100'
            '&CURRENT_PAGE=%s' % page
        )
        res = requests.post(url=self.url, data=req_body, headers=self.headers)
        print(res.text)

if __name__ == '__main__':
    # Run the requests-based baseline demo directly.
    REWQ().test()

成功响应
在这里插入图片描述

3.scrapy请求

import json
import time
from urllib.parse import urlencode

import scrapy

from utils.robot import send_msg


class SalesmaninfospiderSpider(scrapy.Spider):
    # NOTE(review): this is the *broken* variant, kept on purpose for the
    # write-up. The session cookie is only placed in self.headers['Cookie'];
    # scrapy does not forward a raw Cookie header the way requests does, so
    # the server returns a different page than the requests client received.
    name = 'SalesmanInfoSpider'

    def __init__(self):
        # Endpoint serving the paginated producer list; also used as the
        # Referer header since the page posts back to itself.
        self.url = 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp'
        # Browser headers captured from a logged-in session.
        self.headers = {
            'Host': 'www.dqzsteel.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Origin': 'http://www.dqzsteel.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; factoryLogin_rand=ADEAOQA0; salesmanLogin_rand=ADgAMgAw; SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; SESSION_ROLE_TYPE=ThpSoVRY; SESSION_NAME=jCJRdpdZ; SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; SESSION_SALESMAN_ID=ADgAMwAzADUAMB; SESSION_COMPANY_NAME=; SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; SESSION_CONTROL=AFAAQQBTAFD; SESSION_LOGIN_MESSAGE=; JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; Hm_lpvt_87ef776965ff233cab5e4054821b5e94=1657870616',
        }

    def start_requests(self):
        # Refresh the trailing Hm_lpvt_* cookie field with the current unix
        # timestamp before issuing the first request.
        timetemp = int(time.time())
        self.headers['Cookie'] = 'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; factoryLogin_rand=ADEAOQA0; salesmanLogin_rand=ADgAMgAw; SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; SESSION_ROLE_TYPE=ThpSoVRY; SESSION_NAME=jCJRdpdZ; SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; SESSION_SALESMAN_ID=ADgAMwAzADUAMB; SESSION_COMPANY_NAME=; SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; SESSION_CONTROL=AFAAQQBTAFD; SESSION_LOGIN_MESSAGE=; JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; Hm_lpvt_87ef776965ff233cab5e4054821b5e94=%s'%(str(timetemp))
        page = 1

        # Form body: fixed ordering/paging fields, variable CURRENT_PAGE.
        req_body = 'ORDER_BY_FIELD=ORDER_NUMBER&ORDER_BY_DIRECTION=ASC&RECORD_COUNT=924&PAGE_COUNT=10&PAGINAL_COUNT=100&CURRENT_PAGE=%s'%(page)
        # cookies= is deliberately left commented out here; passing the
        # cookie only via headers is exactly what makes this variant fail.
        yield scrapy.Request(url=self.url, method="POST", body=req_body, headers=self.headers,#cookies=cookies,
                            meta={"page": page},
                             callback=self.get_product, dont_filter=True)


    def get_product(self, response):
        """
        Get the product ids from the listing response.
        """

        page = response.meta.get("page")
        try:
            # Dump request headers and raw body to compare against the
            # requests-based client's output.
            print(response.request.headers)
            res_data = response.text
            print(res_data)
            pass

        except Exception as e:
            # Best-effort alert via the project bot; error is swallowed.
            send_msg(e)
            pass

4.请求成功,但是返回的数据不一样

在这里插入图片描述
在这里插入图片描述
scrapy 传递 cookie 的方式与 requests 不一样:scrapy 不会使用 headers 里的 Cookie 字段,需要把 cookie 解析成字典后通过 cookies 参数单独传入,改为以下即可:

cookies = {i.split("=")[0]: i.split("=")[1] for i in str(self.headers['Cookie']).split("; ")}
yield scrapy.Request(url=self.url, method="POST", body=req_body, headers=self.headers,cookies=cookies,
                    meta={"page": page},
                     callback=self.get_product, dont_filter=True)

成功响应一样的页面:
在这里插入图片描述

5.改正后,scrapy完整代码:

import os
import time

import scrapy
import pandas as pd
from utils.robot import send_msg


class SalesmaninfospiderSpider(scrapy.Spider):
    """Crawl the paginated producer list and append parsed tables to a CSV.

    Key point versus the requests client: scrapy does not use a raw
    ``Cookie`` header on the request, so the cookie string is also parsed
    into a dict and passed through the ``cookies=`` argument.
    """

    name = 'SalesmanInfoSpider'

    # Captured session cookie. Only the trailing Hm_lpvt_* field varies: it
    # is a unix timestamp refreshed on every request. Previously this whole
    # string was duplicated in start_requests and get_product.
    _COOKIE_TEMPLATE = 'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; factoryLogin_rand=ADEAOQA0; salesmanLogin_rand=ADgAMgAw; SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; SESSION_ROLE_TYPE=ThpSoVRY; SESSION_NAME=jCJRdpdZ; SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; SESSION_SALESMAN_ID=ADgAMwAzADUAMB; SESSION_COMPANY_NAME=; SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; SESSION_CONTROL=AFAAQQBTAFD; SESSION_LOGIN_MESSAGE=; JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; Hm_lpvt_87ef776965ff233cab5e4054821b5e94=%s'

    # Form body for one page: fixed ordering/paging fields, variable CURRENT_PAGE.
    _BODY_TEMPLATE = ('ORDER_BY_FIELD=ORDER_NUMBER&ORDER_BY_DIRECTION=ASC'
                      '&RECORD_COUNT=927&PAGE_COUNT=10&PAGINAL_COUNT=100'
                      '&CURRENT_PAGE=%s')

    def __init__(self):
        # Let scrapy.Spider perform its own initialization (name/kwargs
        # handling); the original omitted this.
        super().__init__()
        # Endpoint serving the paginated producer list; also used as Referer.
        self.url = 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp'
        # Browser headers captured from a logged-in session.
        self.headers = {
            'Host': 'www.dqzsteel.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Origin': 'http://www.dqzsteel.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': self._COOKIE_TEMPLATE % '1657870616',
        }

    def _page_request(self, page):
        """Build the POST request for one page with a fresh cookie timestamp.

        Shared by start_requests and the next-page logic in get_product,
        which previously duplicated this construction.
        """
        self.headers['Cookie'] = self._COOKIE_TEMPLATE % int(time.time())
        # scrapy only honors cookies passed via ``cookies=``, so split the
        # header string into a name -> value dict.
        cookies = {i.split("=")[0]: i.split("=")[1] for i in str(self.headers['Cookie']).split("; ")}
        return scrapy.Request(url=self.url, method="POST",
                              body=self._BODY_TEMPLATE % page,
                              headers=self.headers, cookies=cookies,
                              meta={"page": page},
                              callback=self.get_product, dont_filter=True)

    def start_requests(self):
        # Entry point: begin crawling at page 1.
        yield self._page_request(1)

    def get_product(self, response):
        """Parse one listing page into CSV rows and queue the next page."""
        page = response.meta.get("page")
        print("正在爬取第", str(page), "页!")
        try:
            # <br> tags would otherwise merge cell text; replace with spaces
            # before letting pandas parse the HTML tables.
            tables = pd.read_html(response.text.replace('<br>', '  '))
            if len(tables) > 1:
                df1 = tables[0]
                # Keep only the columns of interest.
                df1 = df1.drop([0, 2, 4, 6, 7, 8, 9, 10], axis=1)
                # BUG FIX: reset_index returns a new frame; the original
                # discarded the result, leaving the index untouched.
                df1 = df1.reset_index(drop=True)
                path = os.path.join(os.getcwd(), 'excel', "1.csv")
                # Drop rows containing any NaN, then append to the CSV.
                df1 = df1.dropna(axis=0, how='any')
                print(df1)
                df1.to_csv(path, mode='a')
            # Next page, up to a hard-coded maximum of 10 pages.
            if page < 10:
                yield self._page_request(page + 1)
            else:
                print("爬取完成!")

        except Exception as e:
            # Best-effort alert via the project bot; the error is swallowed
            # so one bad page does not kill the spider.
            send_msg(e)

6.成功爬取,效果如下:

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

码农螺丝钉

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值