电子发票通常以 PDF 文件的形式存在,各大电商及运营商平台几乎都提供电子发票,如京东、淘宝(天猫)、苏宁易购、携程、中国联通、中国电信、中国移动等。那么,如何以编程方式抓取这些平台的电子发票呢?下面直接给出代码供参考(以中国联通为例);经测试,各大电商平台的电子发票都可以用类似的方式抓取。欢迎加 QQ 283335746 共同探讨以编程方式(爬虫)抓取数据信息。
/// <summary>
/// China Unicom (10010.com) electronic-invoice collector.
/// Queries the invoice list for the account identified by the supplied cookie,
/// downloads each invoice PDF and passes invoices that are not yet recorded
/// to the invoice business layer for saving.
/// </summary>
public class ChinaUnicom
{
    private const string BaseUrl = "http://wap.10010.com";

    // A response whose final URI contains this fragment is treated as "login required".
    private const string LoginPartialUrl = "uac.10010.com/oauth2/new_auth";

    // NOTE(review): not referenced anywhere in this class; kept so the declarations stay unchanged.
    private const string QueryInvoicesRefererUrl = "http://wap.10010.com/mobileService/query/einvoice.htm?navUrlCode=1307&menuId=000200060012";

    private const string QueryInvoicesUrl = "http://wap.10010.com/mobileService/query/einvoicelist.htm";

    private readonly string _userId = string.Empty;
    private readonly string _userCookie = string.Empty;
    private readonly string _relationUserId = string.Empty;
    private readonly NetClient _client;
    private readonly ThirdPartyBll _invoiceBll;
    private readonly ThirdPartyRequest _thirdPartyRequest;

    /// <summary>
    /// Order/invoice responses collected by the current <see cref="ExecuteInvoiceAsync"/> run.
    /// </summary>
    private List<UnicomOrderInvoiceInfo> OrderInvoices { get; set; }

    /// <summary>
    /// Parameterless construction is not available to callers;
    /// use <see cref="ChinaUnicom(string, string)"/> instead.
    /// </summary>
    private ChinaUnicom() { }

    /// <summary>
    /// Creates a collector for one user.
    /// </summary>
    /// <param name="userId">Identifier of the local user the invoices are saved for.</param>
    /// <param name="cookieAppend">Cookie string of the user's 10010.com session; used for every request.</param>
    public ChinaUnicom(string userId, string cookieAppend)
    {
        _userId = userId;
        _userCookie = cookieAppend;
        // Presumably extracts the "_uop_id" cookie entry from the cookie string — confirm against Common.CookieFilter.
        _relationUserId = Common.CookieFilter(_userCookie, string.Format(@"\s*{0}\s*=\s*(.|\n)*?;", "_uop_id"));
        OrderInvoices = new List<UnicomOrderInvoiceInfo>();
        _invoiceBll = new ThirdPartyBll(userId, _relationUserId, ThirdPartyOptions.ChinaUnicom);
        _thirdPartyRequest = new ThirdPartyRequest(cookieAppend);
        _client = new NetClient(BaseUrl);
    }

    /// <summary>
    /// Entry point of the invoice processing: fetches the invoice list, downloads the PDF of every
    /// invoice that is not yet recorded and saves it through the invoice business layer.
    /// </summary>
    /// <returns>A task that completes when all invoices have been processed.</returns>
    /// <exception cref="CustomException">
    /// Thrown when the invoice-list request ends up on the login page, i.e. the session cookie is no longer valid.
    /// </exception>
    public async Task ExecuteInvoiceAsync()
    {
        // Start from a clean list so that a repeated call does not re-process the previous run's responses.
        OrderInvoices.Clear();

        var request = _thirdPartyRequest.CreateRequest(QueryInvoicesUrl, MethodOptions.Post);
        request.AddParameter(Dcr.ContentTypeKey, Dcr.ContentType, ParameterType.HttpHeader);

        // Collect all orders that carry invoice information.
        // NOTE(review): only page 1 with a page size of 40 is requested; further pages are never fetched.
        await GetOrderInvoicesAsync(request, 1, 40, "record");
        if (!OrderInvoices.Any())
        {
            if (Log.IsDebugEnabled) Log.Debug("ChinaUnicom.ExecuteInvoiceAsync,UserId:{0},RelationUserId:{1},not any invoice info! data from OrderInvoices.", _userId, _relationUserId);
            return;
        }

        // Materialize once: the flattening query is deferred and would otherwise be evaluated
        // again for the emptiness check and for the loop below.
        var unicomInvoices = ToUnicomInvoices(OrderInvoices).ToList();
        if (unicomInvoices.Count == 0)
        {
            if (Log.IsDebugEnabled) Log.Debug("ChinaUnicom.ExecuteInvoiceAsync,not any invoice info! data from unicomInvoices.");
            return;
        }

        var oldInvoices = await _invoiceBll.GetUserThirdPartyTInvoices();
        foreach (var item in unicomInvoices)
        {
            // Look at the stored invoices first so an already recorded invoice does not trigger a download.
            var oldInfo = oldInvoices?.FirstOrDefault(m => m.InvoiceCode == item.invoicecode && m.InvoiceNumber == item.invoicenum);
            if (oldInfo != null) continue;

            var filePath = await DownloadInvoiceAsync(item);
            if (string.IsNullOrEmpty(filePath)) continue;

            await _invoiceBll.SaveInvoice(filePath, string.Empty);
        }
    }

    /// <summary>
    /// Requests one page of orders that carry invoice information and, when the response contains
    /// any, appends it to <see cref="OrderInvoices"/>.
    /// </summary>
    /// <param name="request">Prepared request for the invoice-list endpoint; the paging parameters are added to it.</param>
    /// <param name="pageIndex">Value sent as the "page" parameter.</param>
    /// <param name="pageSize">Value sent as the "pageSize" parameter.</param>
    /// <param name="type">Value sent as the "type" parameter.</param>
    /// <returns>A task that completes when the response has been processed.</returns>
    /// <exception cref="CustomException">Thrown when the response URI points to the login page.</exception>
    private async Task GetOrderInvoicesAsync(NetRequest request, int pageIndex, int pageSize, string type)
    {
        request.AddParameter("page", pageIndex.ToString());
        request.AddParameter("pageSize", pageSize.ToString());
        request.AddParameter("type", type);

        var response = await _client.ExecuteAsync(request);

        // The response URI can be absent when no response was received; only inspect it when present.
        var responseUri = response.ResponseUri;
        if (responseUri != null && responseUri.ToString().Contains(LoginPartialUrl))
            throw new CustomException(MC.M_ThirdPartySiteLoginNeeding);

        if (string.IsNullOrEmpty(response.Content)) return;

        var orderInvoiceInfo = JsonConvert.DeserializeObject<UnicomOrderInvoiceInfo>(response.Content);
        if (orderInvoiceInfo?.totalmap == null || !orderInvoiceInfo.totalmap.Any())
        {
            if (Log.IsDebugEnabled) Log.Debug("ChinaUnicom.GetOrderInvoicesAsync,not has invoiceinfo! request Resource is : {0}", request.Resource);
            return;
        }

        OrderInvoices.Add(orderInvoiceInfo);
    }

    /// <summary>
    /// Downloads the PDF of one electronic invoice and stores it on disk.
    /// </summary>
    /// <param name="invoiceInfo">Invoice whose code, number and URL are used for the download.</param>
    /// <returns>
    /// The path of the stored PDF (an already existing file is reused without a new download),
    /// or <see cref="string.Empty"/> when the invoice data is incomplete or the response is not a PDF.
    /// </returns>
    private async Task<string> DownloadInvoiceAsync(UnicomInvoiceInfo invoiceInfo)
    {
        if (invoiceInfo == null) return string.Empty;
        if (string.IsNullOrEmpty(invoiceInfo.invoicecode) || string.IsNullOrEmpty(invoiceInfo.invoicenum)) return string.Empty;

        // Code and number come from the remote response and become part of a file name;
        // reject values containing path separators or other characters invalid in a file name.
        var invalidNameChars = Path.GetInvalidFileNameChars();
        if (invoiceInfo.invoicecode.IndexOfAny(invalidNameChars) >= 0 || invoiceInfo.invoicenum.IndexOfAny(invalidNameChars) >= 0)
            return string.Empty;

        var filePath = FileHelper.GetFilePath("ChinaUnicom", string.Format(@"{0}-{1}.pdf", invoiceInfo.invoicecode, invoiceInfo.invoicenum));
        if (File.Exists(filePath)) return filePath;

        // Without a URL there is nothing to download.
        if (string.IsNullOrEmpty(invoiceInfo.invoiceurl)) return string.Empty;

        // NOTE(review): the second argument is presumably the referer of the request — confirm against ThirdPartyRequest.CreateRequest.
        var request = _thirdPartyRequest.CreateRequest(invoiceInfo.invoiceurl, QueryInvoicesUrl);
        var response = await _client.ExecuteAsync(request);
        if (response.ContentLength == 0 || response.ContentType != Dcr.PdfContentType) return string.Empty;

        // Never write an empty file: it would be picked up as a valid cached PDF on the next run.
        var pdfBytes = response.RawBytes;
        if (pdfBytes == null || pdfBytes.Length == 0) return string.Empty;

        await FileHelper.Save(pdfBytes, filePath);
        return filePath;
    }

    /// <summary>
    /// Flattens the order/invoice responses into a plain sequence of invoices,
    /// skipping missing (null) orders, invoice lists and invoice entries.
    /// </summary>
    /// <param name="unicomOrderInvoices">Responses returned by the invoice-list endpoint.</param>
    /// <returns>A lazily evaluated sequence of all invoices contained in the responses; never null.</returns>
    private static IEnumerable<UnicomInvoiceInfo> ToUnicomInvoices(IEnumerable<UnicomOrderInvoiceInfo> unicomOrderInvoices)
    {
        if (unicomOrderInvoices == null) return Enumerable.Empty<UnicomInvoiceInfo>();

        return from order in unicomOrderInvoices
               where order != null && order.totalmap != null
               from entry in order.totalmap
               where entry.value != null
               from invoice in entry.value
               where invoice != null
               select invoice;
    }
}