Implementing Multithreading and Resumable Downloads in a Web Crawler

This article discusses how to speed up a download and implement resuming once the download link has been obtained. The key is to use the Range property of HttpWebRequest to set the download range, and to download the different parts on multiple threads. The author provides a C# code sample showing how to create and configure the HttpWebRequest for this purpose, and outlines the resume mechanism: save the size of the part already downloaded, then continue from that position next time.

How to obtain the download link and construct the Request is not the focus of this article. The focus is how, once the link is in hand, to speed up the download and to implement resuming after an interruption.
Once you have the download link, the total size of the target file can be read from the Response, that is, the ContentLength value, while the Range property of the Request sets the byte range to download. Knowing this principle, multithreading and resumable downloads are not hard to implement.
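To make the principle concrete first, here is a minimal sketch (the URL is a placeholder; it assumes the server answers HEAD requests and honors Range, which not every server does): one request reads the total size from ContentLength, and a second one fetches only a byte range via AddRange.

    using System;
    using System.Net;

    class RangeProbe
    {
        static void Main()
        {
            var url = "https://example.com/file.mp4"; // placeholder URL

            // First request: read the total size of the resource from the response headers.
            var sizeReq = (HttpWebRequest)WebRequest.Create(url);
            sizeReq.Method = "HEAD";
            long totalLength;
            using (var resp = (HttpWebResponse)sizeReq.GetResponse())
                totalLength = resp.ContentLength;

            // Second request: fetch only the first half of the resource via the Range header.
            var partReq = (HttpWebRequest)WebRequest.Create(url);
            partReq.AddRange(0, totalLength / 2 - 1); // sends "Range: bytes=0-..."
            using (var resp = (HttpWebResponse)partReq.GetResponse())
                Console.WriteLine(resp.StatusCode); // PartialContent (206) means the range was honored
        }
    }

The author's code implementing the multithreaded download follows: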

        // Required namespaces: System, System.Collections.Generic, System.Net,
        // System.Text.RegularExpressions, System.Threading, System.Windows.Forms.

        /// <summary>
        /// Builds a ranged request: the from/to parameters control the start and end
        /// download positions of the stream, which makes multithreaded downloading
        /// possible and even allows resuming a download after an interruption.
        /// </summary>
        /// <param name="url"></param>
        /// <param name="from">The start download position of the stream</param>
        /// <param name="to">The end download position of the stream; -1 (the default) requests an open-ended range</param>
        /// <returns></returns>

        public static HttpWebRequest getBlblHttpReq3(String url, long from, long to = -1)
        {

            string headers = $@"GET /upgcxcode/49/91/252239149/252239149-1-416.mp4?e=ig8euxZM2rNcNbh17WdVhoMzhWUVhwdEto8g5X10ugNcXBlqNxHxNEVE5XREto8KqJZHUa6m5J0SqE85tZvEuENvNo8g2ENvNo8i8o859r1qXg8xNEVE5XREto8GuFGv2U7SuxI72X6fTr859r1qXg8gNEVE5XREto8z5JZC2X2gkX5L5F1eTX1jkXlsTXHeux_f2o859IB_&uipk=5&nbs=1&deadline=1608147118&gen=playurl&os=bcache&oi=1898897508&trid=e78bffb50fe34dbcbfa1a64f94ef5e5dp&platform=pc&upsig=2c86c8b3551699a6761bf640d27261d3&uparams=e,uipk,nbs,deadline,gen,os,oi,trid,platform&cdnid=3230&mid=598717778&orderid=0,3&agrr=0&logo=80000000 HTTP/1.1
Host: cn-bj3-cc-bcache-11.bilivideo.com
Connection: keep-alive
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36
Accept-Encoding: identity;q=1, *;q=0
Accept: */*
Sec-Fetch-Site: cross-site
Sec-Fetch-Mode: no-cors
Sec-Fetch-Dest: video
Referer: https://www.bilibili.com/
Accept-Language: zh-CN,zh;q=0.9
Range: bytes=0-
";
            String pattern = "(GET (?<url>.*)\n)(Host:(?<host>.*))\n(Connection:(?<Connection>.*))\n(User-Agent: (?<User-Agent>.*))\n(Accept:(?<Accept>.*))\n(Origin:(?<Origin>.*))\n(Sec-Fetch-Site: (?<Sec-Fetch-Site>.*))\n(Sec-Fetch-Mode: (?<Sec-Fetch-Mode>.*))\n(Sec-Fetch-Dest: (?<Sec-Fetch-Dest>.*))\n(Referer: (?<Referer>.*))\n(Accept-Encoding: (?<Accept-Encoding>.*)\n(Accept-Language: (?<Accept-Language>.*))\n(Range: (?<Range>.*))\n(If-Range: (?<If-Range>.*))";
            pattern = "(GET (?<url>.*)\n)(Host:(?<host>.*))\n(Connection:(?<Connection>.*))\n(User-Agent: (?<User-Agent>.*))\n(Accept:(?<Accept>.*))\n(Origin:(?<Origin>.*))\n(Sec-Fetch-Site: (?<Sec-Fetch-Site>.*))\n(Sec-Fetch-Mode: (?<Sec-Fetch-Mode>.*))\n(Sec-Fetch-Dest: (?<Sec-Fetch-Dest>.*))\n(Referer: (?<Referer>.*))\n(Accept-Encoding: (?<Accept-Encoding>.*)\n(Accept-Language: (?<Accept-Language>.*))\n(Range: (?<Range>.*))";
            var web = new HtmlWeb();
            //var urlPtn = @"([GET|OPTIONS]\s(?<url>.*logo=[0-9]*))";
            var hostPtn = @"(Host:\s(?<host>.*)\r)";//for the headers parser 
            //get host from url 
            var hostptn2 = @"(https://(?<host>.+?))/";//for the param url  parser 
            Match match = Regex.Match(url, hostPtn, RegexOptions.IgnoreCase);

            var httpwebRequest = (HttpWebRequest)WebRequest.Create(url); // requests.get()

            try
            {
                // The two key items: Host can be parsed from the URL itself, while Referer
                // has to be taken from the request captured with Fiddler.
                match = Regex.Match(url, hostptn2, RegexOptions.IgnoreCase);
                httpwebRequest.Host = match.Groups["host"].Value;

                // Set this thread's byte range; with no end position, request from the
                // given offset to the end of the file (an open-ended range).
                if (to >= 0)
                    httpwebRequest.AddRange(from, to);
                else
                    httpwebRequest.AddRange(from);

                pattern = "(Referer: (?<Referer>.*))\r";
                match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
                httpwebRequest.Referer = match.Groups["Referer"].Value;


                pattern = "(Origin:(?<Origin>.*))\r";
                match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
                httpwebRequest.Headers["origin"] = match.Groups["Origin"].Value;

                httpwebRequest.Method = "Get";


                pattern = "(Accept:(?<Accept>.*)\r)";
                match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
                httpwebRequest.Accept = match.Groups["Accept"].Value;

                pattern = "(Accept-Encoding:(?<AcceptEncoding>.*)\r)";
                match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
                httpwebRequest.Headers["Accept-Encoding"] = match.Groups["AcceptEncoding"].Value;

                pattern = "(Accept-Language:(?<AcceptLanguage>.*)\r)";
                match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
                httpwebRequest.Headers["Accept-Language"] = match.Groups["AcceptLanguage"].Value;

                pattern = "(User-Agent:(?<UserAgent>.*)\r)";
                match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
                httpwebRequest.UserAgent = match.Groups["UserAgent"].Value;


                httpwebRequest.Headers["Upgrade-Insecure-Requests"] = "1";
                httpwebRequest.KeepAlive = true;
                return httpwebRequest;

            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
            return null;

        }

The headers section was obtained by capturing and analyzing the traffic with the Fiddler tool. This is the key to the download, but because each site has its own anti-leeching techniques, this part has to be analyzed per site. (Of course, for anyone with the skills to build a browser, this would be simple.)
How to call the function above:

            index = 0;
            threadTll = validUrl.Count;
            partLen = totalLength / threadTll; // integer division; the last thread takes the remainder

            String threadFile = "";
            List<DownLoadThread> dldThredLst = new List<DownLoadThread>();
            foreach (string url in validUrl)
            {
                threadFile = string.Format("{0}_{1}.{2}", fileName, index, fileType);

                // Every thread but the last gets a closed byte range; the last thread
                // downloads from its start position to the end of the file.
                if (index + 1 < threadTll)
                {
                    httpRequest = myFileDownLoader.getBlblHttpReq3(url, partLen * index, partLen * (index + 1) - 1);
                }
                else
                {
                    httpRequest = myFileDownLoader.getBlblHttpReq3(url, partLen * index);
                }

                DownLoadThread dldThread = new DownLoadThread(httpRequest, threadFile, index, bVideo);
                dldThredLst.Add(dldThread);

                // One worker thread per part; each saves its range into its own part file.
                Thread httpRqstThread = new Thread(new ThreadStart(dldThread.saveStreamToFile));
                httpRqstThread.Start();

                fileLst.Add(threadFile);
                index++;
            }
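The DownLoadThread class itself is not shown in the post. As a hypothetical minimal sketch of what its saveStreamToFile worker might do (the real constructor also takes an index and a bVideo flag, omitted here), each worker simply streams its ranged response into its own part file:

    using System.IO;
    using System.Net;

    public class DownLoadThread
    {
        private readonly HttpWebRequest request;
        private readonly string partFile;

        // Simplified constructor; the post's version also receives an index and a video flag.
        public DownLoadThread(HttpWebRequest request, string partFile)
        {
            this.request = request;
            this.partFile = partFile;
        }

        // Runs on its own thread: copy this thread's byte range into its own part file.
        public void saveStreamToFile()
        {
            using (var response = (HttpWebResponse)request.GetResponse())
            using (var input = response.GetResponseStream())
            using (var output = File.Create(partFile))
            {
                input.CopyTo(output);
            }
        }
    }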

Walkthrough:
The first step is to obtain all the download links; how to do so depends on the specific site. The author downloads Bilibili videos, where a single video has several download URLs, so different threads can use different URLs to download different parts, which are merged afterwards (a possible merge step is sketched below). The same technique works with a single URL as well, although for Bilibili that does not seem to get any faster (it is probably rate-limited). Given the thread count, compute the start and end position of each chunk, then start the individual threads; that is all the multithreading amounts to.
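The merge step is not shown in the post. A possible sketch, assuming the part files are passed in thread-index order so that concatenation reproduces the original byte order:

    using System.IO;

    // Concatenate the per-thread part files, in thread-index order, into the final file.
    public static void MergeParts(string[] orderedPartFiles, string targetFile)
    {
        using (var output = File.Create(targetFile))
        {
            foreach (var part in orderedPartFiles)
            {
                using (var input = File.OpenRead(part))
                    input.CopyTo(output);
                File.Delete(part); // optional: clean up once the part has been copied
            }
        }
    }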
Implementing resume after interruption
Compute the size of the stream already received, derive the new start position from it, and save that value to the registry or to a file; the next time the download runs, it can use the saved value as its new starting offset. Many other issues still have to be handled, of course, such as how the downloaded file is saved.
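As a sketch of that idea (the names are illustrative, not from the post): if the part file itself is kept, its length already records the progress, so the next run can read the length, request an open-ended range from that offset, and append to the file.

    using System.IO;
    using System.Net;

    // Resume a partial download: the bytes already on disk define the new start offset.
    public static void ResumeDownload(string url, string partFile)
    {
        long downloaded = File.Exists(partFile) ? new FileInfo(partFile).Length : 0;

        var request = (HttpWebRequest)WebRequest.Create(url);
        request.AddRange(downloaded); // sends "Range: bytes=<downloaded>-"

        using (var response = (HttpWebResponse)request.GetResponse())
        using (var input = response.GetResponseStream())
        using (var output = new FileStream(partFile, FileMode.Append, FileAccess.Write))
        {
            input.CopyTo(output); // append the remaining bytes after the existing ones
        }
    }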
A worked resume example will follow when time permits.
maraSun at BJFWDQ
2022-02-27
