One of my recent projects involved scraping some web data for offline processing. I started with the excellent request library by Mikeal Rogers, which offers a number of convenient improvements over the default Node http library.
As soon as I unleashed my first prototype on the web, the database started growing much faster than I had planned. I had started out storing raw, uncompressed response data, so an immediate optimization was to use the Accept-Encoding HTTP request header to ask servers for compressed data.
Unfortunately, some of my target servers sometimes sent back uncompressed data anyway (which they're entitled to do under the HTTP spec; it's just slightly annoying). I needed a way to conditionally handle compressed data based on the Content-Encoding response header. I found a solution that worked with the default Node.js HTTP library, but it wasn't immediately obvious how to port it to Mikeal's request library.
Approach 1: no streams
My first solution collected data chunks into a Buffer, then passed that into the relevant zlib functions if needed. It’s more code than I wanted, but it works well.
Note: for simplicity, I’ve left out the logic that writes the compressed response body to the database.
Approach 2: streams
The downside to the first approach is that all response data is buffered in memory. This was fine for my use case, but in general this can cause memory issues if you’re scraping websites with really large response bodies.
A better approach is to use streams, as Mikeal suggested. Streams are a wonderful abstraction that can help you manage memory consumption better, among other things. There are two great introductions to Node streams here and here. Keep in mind that streams in Node.js are somewhat intricate and still evolving (for example, Node 0.10 introduced streams2 which is not entirely backwards compatible with older versions of Node).
Here’s a working solution that pipes response data into a zlib stream, then pipes that into a final destination (a file, in this case). Notice that the code is cleaner and more readable.
Summary
Both of those approaches will get the job done with Mikeal’s library, and the one you choose depends on the use case. In my project, I needed to save the compressed response data as a field of a Mongoose document, then further process the decompressed data. Streams don’t suit this use case well, so I used the first approach.