以编程方式抓取苏宁电子发票

电子发票通常以 PDF 文件的形式存在,各大电商几乎都提供电子发票,如京东、淘宝(天猫)、苏宁易购、携程、中国联通、电信、移动等平台。那么,我们如何以编程方式爬取这些平台的电子发票呢?这里直接给出苏宁易购的示例代码供参考。实际上,经测试,各大电商平台的电子发票都是可以爬取的。欢迎加 QQ 283335746 共同探讨如何以编程方式抓取数据。

/// <summary>
    /// Downloads electronic invoices from Suning (suning.com).
    /// The flow is: fetch the invoice list page, follow every invoice-detail link found on it,
    /// download the invoice file behind each detail page, parse it through
    /// <c>ThirdPartyBll.GetInvoiceInfoByFile</c>, and finally hand the collected results to
    /// <c>ThirdPartyBll.SaveDownloadInvoice</c>.
    /// </summary>
    public class Suning
    {
        private Suning() { }

        /// <summary>
        /// Creates a scraper bound to one user and one set of session cookies.
        /// </summary>
        /// <param name="userId">Identifier passed through to <c>ThirdPartyBll</c>.</param>
        /// <param name="cookieAppend">
        /// Cookie string used to build every outgoing request. It is also passed through
        /// <c>Common.CookieFilter</c> with a pattern matching the <c>authId</c> cookie before being
        /// given to <c>ThirdPartyBll</c> (presumably to strip or isolate that cookie - confirm against
        /// <c>Common.CookieFilter</c>).
        /// </param>
        public Suning(string userId, string cookieAppend)
        {
            this._client = new NetClient(BaseUrl);
            this._invoiceBll = new ThirdPartyBll(userId, Common.CookieFilter(cookieAppend, string.Format(@"\s*{0}\s*=\s*(.|\n)*?;", "authId")), ThirdPartyOptions.Suning);
            this._thirdPartyRequest = new ThirdPartyRequest(cookieAppend);
            this.OrderInvoices = new List<DownloadInvoiceInfo>();
        }

        private const string BaseUrl = "https://order.suning.com";
        private const string OrderInvoicesRefererUrl = "https://order.suning.com/invoice/invoiceList.do";
        private const string OrderInvoicesUrl = "https://order.suning.com/invoice/queryInvoiceList.do";
        private const string InvoiceDetailUrl = "https://order.suning.com/invoice/queryInvoiceDetail.do?orderId={0}&vendorCode=0000000000";

        private readonly NetClient _client;
        private readonly ThirdPartyBll _invoiceBll;
        private readonly ThirdPartyRequest _thirdPartyRequest;

        // Invoices collected by GetOrderInvoicesAsync; only ever assigned in the public constructor.
        private readonly List<DownloadInvoiceInfo> OrderInvoices;

        /// <summary>
        /// Entry point: downloads and parses all invoices, then saves the collected set.
        /// </summary>
        /// <returns>A task that completes once the invoices have been saved.</returns>
        public async Task ExecuteInvoiceAsync()
        {
            if (Log.IsDebugEnabled) Log.Debug("Suning.ExecuteInvoiceAsync is starting");

            // Download and parse the invoice information for every order.
            await GetOrderInvoicesAsync();

            // Persist the collected invoice set.
            await _invoiceBll.SaveDownloadInvoice(OrderInvoices);
        }

        /// <summary>
        /// Fetches the invoice list page and processes every invoice-detail link on it,
        /// appending each successfully parsed invoice to <c>OrderInvoices</c>.
        /// </summary>
        /// <returns>A task that completes when all links have been processed.</returns>
        /// <exception cref="CustomException">
        /// Thrown when the response was redirected to a URL whose path contains "login".
        /// </exception>
        private async Task GetOrderInvoicesAsync()
        {
            if (Log.IsDebugEnabled) Log.Debug("Suning.GetOrderInvoicesAsync 11 is starting");

            var request = _thirdPartyRequest.CreateRequest(OrderInvoicesUrl, OrderInvoicesRefererUrl);
            var response = await _client.ExecuteAsync(request);

            if (Log.IsDebugEnabled) Log.Debug("Suning.GetOrderInvoicesAsync 12,response.ResponseUri:{0},response.ContentType:{1},ContentLength:{2}", response.ResponseUri, response.ContentType, response.ContentLength);

            if (Log.IsDebugEnabled) Log.Debug("response.Content:{0}", response.Content);

            // A redirect to a login page means the supplied cookies are no longer valid.
            if (response.ResponseUri != null && response.ResponseUri.AbsolutePath.Contains("login")) throw new CustomException(MC.M_ThirdPartySiteLoginNeeding);
            if (response.ContentLength == 0) return;

            var document = HDocument.Parse(response.Content);

            // Materialize once: the query would otherwise be re-evaluated by both the
            // emptiness check and the loop below.
            var invoiceDetailNodes = document.Descendants("a")
                .Where(x => x.Attribute("href") != null && x.Attribute("href").Value.Contains("invoiceDetail.do"))
                .ToList();
            if (invoiceDetailNodes.Count == 0) return;

            if (Log.IsDebugEnabled) Log.Debug("Suning.GetOrderInvoicesAsync 14 is starting");

            foreach (var item in invoiceDetailNodes)
            {
                var invoiceUrl = await GetOrderDetailsAsync(item.Attribute("href").Value);

                // The detail page may be empty or may not expose a download link; skip such orders
                // instead of issuing a request for a null URL.
                if (string.IsNullOrEmpty(invoiceUrl)) continue;

                var downloadInvoiceInfo = await DownloadInvoiceAsync(invoiceUrl);
                if (downloadInvoiceInfo == null) continue;

                OrderInvoices.Add(downloadInvoiceInfo);
            }
        }

        /// <summary>
        /// Fetches the invoice detail page for an order and extracts the invoice download link.
        /// </summary>
        /// <param name="referer">
        /// The invoice-detail link taken from the list page. Its <c>orderId</c> query value is used to
        /// build the request URL, and the link itself is passed as the second argument of
        /// <c>CreateRequest</c>.
        /// </param>
        /// <returns>
        /// The <c>href</c> of the first anchor whose text contains "查看发票", or <c>null</c> when the
        /// response is empty, no such anchor exists, or the anchor has no <c>href</c>.
        /// </returns>
        private async Task<string> GetOrderDetailsAsync(string referer)
        {
            if (Log.IsDebugEnabled) Log.Debug("Suning.GetOrderDetailsAsync 21 is starting");

            var request = _thirdPartyRequest.CreateRequest(string.Format(InvoiceDetailUrl, Common.GetQueryString(referer, "orderId")), referer);
            var response = await _client.ExecuteAsync(request);

            if (Log.IsDebugEnabled) Log.Debug("Suning.GetOrderDetailsAsync 22,response.ResponseUri:{0}, ContentLength:{1}", response.ResponseUri, response.ContentLength);

            if (response.ContentLength == 0) return null;

            var document = HDocument.Parse(response.Content);
            var invoiceNode = document.Descendants("a").FirstOrDefault(x => x.Value.Contains("查看发票"));

            // Guard the attribute as well: an anchor without href must yield null, not throw.
            return invoiceNode?.Attribute("href")?.Value;
        }

        /// <summary>
        /// Downloads the invoice file at <paramref name="url"/>, parses it, and moves it from a
        /// temporary location to its permanent path under the "Suning" folder.
        /// </summary>
        /// <param name="url">Download link of the invoice file.</param>
        /// <returns>
        /// The parsed invoice together with its storage path and download URL, or <c>null</c> when
        /// <paramref name="url"/> is null/empty, the response is empty, or the parsed invoice has no
        /// invoice code or invoice number.
        /// </returns>
        private async Task<DownloadInvoiceInfo> DownloadInvoiceAsync(string url)
        {
            if (Log.IsDebugEnabled) Log.Debug("Suning.DownloadInvoiceAsync 31 is starting -------------");

            if (string.IsNullOrEmpty(url)) return null;

            var request = _thirdPartyRequest.CreateRequest(url);
            var response = await _client.ExecuteAsync(request);

            if (Log.IsDebugEnabled) Log.Debug("Suning.DownloadInvoiceAsync 32,response.ResponseUri:{0}, ContentLength:{1}", response.ResponseUri, response.ContentLength);

            if (response.ContentLength == 0) return null;

            var tempPath = FileHelper.GetTempFileName(string.Format("{0}.pdf", Guid.NewGuid()));
            // Path.ChangeExtension touches only the extension, unlike Replace(".pdf", ...) which
            // would also rewrite a ".pdf" occurring in a directory name.
            var tempPicturePath = Path.ChangeExtension(tempPath, ".jpg");
            await FileHelper.Save(response.RawBytes, tempPath);

            var invoiceInfo = await _invoiceBll.GetInvoiceInfoByFile(tempPath);
            if (string.IsNullOrEmpty(invoiceInfo.InvoiceCode) || string.IsNullOrEmpty(invoiceInfo.InvoiceNumber))
            {
                // Not a recognizable invoice: do not leave the downloaded file behind.
                DeleteTempFiles(tempPath, tempPicturePath);
                return null;
            }

            // Once the invoice has been parsed, move the temporary files to their permanent location.
            var filePath = FileHelper.GetFilePath("Suning", string.Format("{0}-{1}.pdf", invoiceInfo.InvoiceCode, invoiceInfo.InvoiceNumber));
            if (!File.Exists(filePath))
            {
                FileHelper.MoveFile(tempPath, filePath);
                // NOTE(review): the .jpg next to the temp PDF is presumably produced by
                // GetInvoiceInfoByFile - confirm.
                FileHelper.MoveFile(tempPicturePath, Path.ChangeExtension(filePath, ".jpg"));
            }
            else
            {
                // The invoice was stored by an earlier run; discard the duplicate download.
                DeleteTempFiles(tempPath, tempPicturePath);
            }

            invoiceInfo.FileUrl = FileHelper.ToVirtualUrl(filePath);
            invoiceInfo.Picture = invoiceInfo.FileUrl.Replace(".pdf", ".jpg");

            return new DownloadInvoiceInfo { FilePath = filePath, InvoiceInfo = invoiceInfo, DownloadUrl = request.Resource };
        }

        /// <summary>
        /// Best-effort removal of temporary files; a failed delete is logged and never propagated,
        /// so cleanup problems cannot abort invoice processing.
        /// </summary>
        /// <param name="paths">Paths to delete. Paths that do not exist are ignored by <c>File.Delete</c>.</param>
        private static void DeleteTempFiles(params string[] paths)
        {
            foreach (var path in paths)
            {
                try
                {
                    File.Delete(path);
                }
                catch (IOException ex)
                {
                    if (Log.IsDebugEnabled) Log.Debug("Suning.DeleteTempFiles failed,{0}", ex.Message);
                }
                catch (UnauthorizedAccessException ex)
                {
                    if (Log.IsDebugEnabled) Log.Debug("Suning.DeleteTempFiles failed,{0}", ex.Message);
                }
            }
        }
    }
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值