C# 爬取小说 爬虫

采用多线程爬取,使用 System.Data.SQLite 数据库存储。

发现线程开得太多时,电脑性能不足,会造成部分内容下载失败,还会占用过多资源。

线程开少了下载又慢,只能把线程数调到一个折中的值,让电脑刚好能处理。

代码是想到哪写到哪,架构比较混乱,只能说能跑起来。

 NovelBookDetail

using System;
using System.Collections.Generic;
using System.Data.SQLite;
using System.Linq;
using System.Net.Http;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace novelDownload
{
    public class NovelBookDetail
    {
        // Base address prepended to each chapter href when building chapter_Url.
        // The literal is a placeholder: the author withheld the real site address.
        public static string mainIp = "网站不贴出来了";
        // Path of the SQLite database file handed to SQLiteHelper.
        public static string dbPath = @"D:\DB\Demo.db3";
        // Table with one row per book; read by _02_Down_chapter_name and _03_Down_all_chapter.
        public static string db_Xianxia_table_Name = "Xianxia";
        // Table with one row per chapter; created by CreatDb and queried by IsDownload.
        public static string db_table_Name = "NovelDetail";
        // Sink that receives every message assigned to Log; assigned in _02_Down_chapter_name.
        public static Action<string> Log_callback;
        // Accumulates every message written through Log for this instance.
        StringBuilder log = new StringBuilder();

        /// <summary>
        /// Gets the full text logged so far. Assigning a value appends it to that text
        /// and forwards it to <see cref="Log_callback"/> when a callback is registered.
        /// </summary>
        public string Log
        {
            get
            {
                return log.ToString();
            }
            set
            {
                // Log_callback is only assigned by _02_Down_chapter_name. Guard against it
                // being unset so that writing a log line can never throw.
                Log_callback?.Invoke(value);
                log.Append(value);
            }
        }
        // Not referenced anywhere in this class; kept so the class layout is unchanged.
        private List<BookChapter> list =new List<BookChapter>();
        // Database helper used by Update_db to persist the parsed book details.
        private SQLiteHelper sqlite;

        /// <summary>
        /// Starts downloading the index page of <paramref name="book"/>. When the page
        /// arrives, the description, last update time, latest chapter name and chapter
        /// list markup are extracted into <paramref name="book"/> and written back to
        /// the database through Update_db.
        /// </summary>
        /// <param name="book">Book whose <c>book_url</c> is downloaded; its fields are filled in place.</param>
        public NovelBookDetail(NovelBook book)
        {
            sqlite = new SQLiteHelper(dbPath);
            Log = book.book_url + "\r\n";
            Task download = Down(book.book_url, fileContent =>
            {
                // Stop at the closing quote of the content attribute, like the two patterns
                // below do; the previous pattern captured the trailing quote as well.
                string pattern = @"<meta property=""og:description"" content=""(.+?)""\s*\/>";
                book.description = Regex.Match(fileContent, pattern).Groups[1].Value;
                pattern = @"og:novel:update_time"" content=""(.+)"" \/>";
                book.lastUpdateTime = Regex.Match(fileContent, pattern).Groups[1].Value;
                pattern = @"og:novel:latest_chapter_name"" content=""(.+)"" \/>";
                book.latest_chapter_name = Regex.Match(fileContent, pattern).Groups[1].Value;
                // Everything between the "正文</dt>" heading and the closing </dl>.
                pattern = @"(?<=正文\<\/dt\>)([.\S\s]*)(?=\<\/dl\>)";
                var result = fileContent.get_Match_Groups_1(pattern);
                result = result.Trim();
                book.all_chapter_name = result;
                book.page_source = fileContent;
                Log = (book.ToString());
                Update_db(book);
            });

            // A constructor cannot await, so the download keeps running after it returns.
            // Report a failed download or callback instead of dropping the task silently.
            download.ContinueWith(
                t => { Log = "download failed: " + book.book_url + "\r\n" + t.Exception + "\r\n"; },
                TaskContinuationOptions.OnlyOnFaulted);
        }
        /// <summary>
        /// Opens the database at dbPath and calls CreateTable for the chapter table
        /// (db_table_Name), declaring every column as TEXT.
        /// </summary>
        /// <returns>The helper instance that was used to create the table.</returns>
        static SQLiteHelper CreatDb()
        {
            string[] columns =
            {
                "book_id", "book_name", "chapter_num", "chapter_title", "chapter_Url", "chapter_content",
            };

            // One "TEXT" entry per column.
            string[] columnTypes = new string[columns.Length];
            for (int i = 0; i < columnTypes.Length; i++)
            {
                columnTypes[i] = "TEXT";
            }

            SQLiteHelper helper = new SQLiteHelper(dbPath);
            helper.CreateTable(db_table_Name, columns, columnTypes);
            return helper;
        }
        /// <summary>
        /// Step 1: downloads the book listing pages into the table named by
        /// db_Xianxia_table_Name by delegating to NovelBooks.All_Down_mian.
        /// </summary>
        /// <param name="textBox">Not used by this step.</param>
        public static void _01_Down_book_name(TextBox textBox) =>
            NovelBooks.All_Down_mian(db_Xianxia_table_Name);
        /// <summary>
        /// Step 2: reads rows from the book table via SQLiteHelper.ReadNullTable
        /// (keyed on the "page_source" column), rebuilds a NovelBook from each row and
        /// queues a thread-pool work item that constructs a NovelBookDetail for it.
        /// Also routes all Log output to <paramref name="textBox"/>.
        /// </summary>
        /// <param name="textBox">Text box that receives the row count and the log messages.</param>
        public static void _02_Down_chapter_name(TextBox textBox)
        {
            var pending = SQLiteHelper.ReadNullTable(dbPath, db_Xianxia_table_Name, "page_source");
            NovelBookDetail.Log_callback = message => { textBox.AppendShow(message); };
            textBox.AppendText("read db count:" + pending.Rows.Count);

            for (int rowIndex = 0; rowIndex < pending.Rows.Count; rowIndex++)
            {
                var row = pending.Rows[rowIndex];

                // Copy the stored columns into a fresh book object, one field per column.
                NovelBook book = new NovelBook
                {
                    book_id = row["book_id"].ToString(),
                    db_addDate = row["db_addDate"].ToString(),
                    book_name = row["book_name"].ToString(),
                    type = row["type"].ToString(),
                    book_url = row["book_url"].ToString(),
                    author = row["author"].ToString(),
                    authorUrl = row["authorUrl"].ToString(),
                    page_source = row["page_source"].ToString(),
                };

                // The constructor itself kicks off the download; the instance is not kept.
                ThreadPool.QueueUserWorkItem(
                    state => { new NovelBookDetail((NovelBook)state); },
                    book);
            }
        }
        // One client shared by every index-page download. Creating a new HttpClient per
        // request and never disposing it leaves connections behind and can exhaust sockets.
        private static readonly HttpClient IndexPageClient = new HttpClient();

        /// <summary>
        /// Downloads <paramref name="ip"/> as a string and passes the body to
        /// <paramref name="func"/> when one is supplied.
        /// </summary>
        /// <param name="ip">Address to download.</param>
        /// <param name="func">Optional callback that receives the response body.</param>
        /// <returns>A task that completes after the callback has run.</returns>
        async Task Down(string ip = "https://www.cnblogs.com", Action<string> func = null)
        {
            string response = await IndexPageClient.GetStringAsync(ip);
            func?.Invoke(response);
        }
        // Running totals shown in the progress text: chapters queued for download,
        // chapters skipped because they are already stored, and books skipped entirely.
        public static int downChapterNum = 0,JumpNum=0,jump_book;

        /// <summary>
        /// Step 3: walks every book in the book table, extracts the chapter links from
        /// its stored chapter-list markup and queues a download for each chapter that
        /// IsDownload does not report as stored. Progress is written to
        /// <paramref name="textBox"/>.
        /// </summary>
        /// <param name="textBox">Text box whose content is replaced with the current progress.</param>
        public static void _03_Down_all_chapter(TextBox textBox)
        {
            ThreadPool.SetMinThreads(1, 1);
            ThreadPool.SetMaxThreads(50, 50);
            string sql = "SELECT book_id,book_name,all_chapter_name FROM " + db_Xianxia_table_Name;
            var books = SQLiteHelper.ExecuteDataTable(dbPath, db_Xianxia_table_Name, sql);
            string mianCount = "From Table '" + db_Xianxia_table_Name + "' read date count " + books.Rows.Count;
            var sqlite2 = CreatDb();
            StringBuilder stringBuilder = new StringBuilder();

            // Both patterns are applied to a single chapter line.
            const string pattern_url = @"href=""(.*)"">";
            const string pattern_chapter_name = @""">(.*)<\/a>";

            for (int i = 0; i < books.Rows.Count; i++)
            {
                var line = books.Rows[i];
                var book_id = line["book_id"].ToString();
                var book_name = line["book_name"].ToString();
                var all_chapter_name = line["all_chapter_name"].ToString();

                // Keep only the lines that look like a chapter link.
                var chapters = all_chapter_name
                    .Split(new string[] { "\r\n" }, System.StringSplitOptions.None)
                    .Where(x => x.Length > 75 && x.Contains(@"<dd> <a style="""" href="""))
                    .ToArray();
                string chapterCount = "book '" + book_name + "' chapters count " + chapters.Length;

                for (int j = 0; j < chapters.Length; j++)
                {
                    stringBuilder.Clear();
                    stringBuilder.Append(mianCount);
                    stringBuilder.Append("\r\n\r\n");
                    stringBuilder.Append(chapterCount);
                    stringBuilder.Append("\r\n\r\n");

                    // Skip the whole book when its last chapter is already stored.
                    // Chapter numbers are 1-based (j + 1), so the last chapter is number
                    // chapters.Length; the previous check used chapters.Length - 1 and
                    // therefore looked at the second-to-last chapter.
                    if (j == 0 && IsDownload(book_id, chapters.Length.ToString()))
                    {
                        jump_book++;
                        stringBuilder.Append("book all chapters is download");
                        textBox.SetShow(stringBuilder.ToString());
                        break;
                    }

                    var itemLine = chapters[j];
                    BookChapter bookChapter = new BookChapter();
                    bookChapter.book_id = book_id;
                    bookChapter.book_name = book_name;
                    bookChapter.chapter_Url = mainIp + itemLine.get_Match_Groups_1(pattern_url);
                    bookChapter.chapter_title = itemLine.get_Match_Groups_1(pattern_chapter_name);
                    bookChapter.chapter_num = (j + 1).ToString();

                    if (IsDownload(bookChapter.book_id, bookChapter.chapter_num))
                    {
                        JumpNum++;
                        stringBuilder.Append("is download '" + bookChapter.book_name + "' chapter " + bookChapter.chapter_num + "\r\n\r\n");
                        stringBuilder.Append("download chapter count " + downChapterNum + " ,jump chapter count " + JumpNum + " ,jump book count " + jump_book + "\r\n");
                        textBox.SetShow(stringBuilder.ToString());
                        continue;
                    }

                    downChapterNum++;
                    stringBuilder.Append("start download '" + bookChapter.book_name + "' chapter " + bookChapter.chapter_num + "\r\n\r\n");
                    stringBuilder.Append("download chapter count " + downChapterNum + " ,jump chapter count " + JumpNum + " ,jump book count " + jump_book + "\r\n");
                    textBox.SetShow(stringBuilder.ToString());
                    Down_chapter(bookChapter, sqlite2);
                }
            }
        }
      /// <summary>
      /// Reports whether the chapter table already holds a row for the given book
      /// and chapter number.
      /// </summary>
      /// <param name="book_id">Value compared against the book_id column.</param>
      /// <param name="chapter_num">Value compared against the chapter_num column.</param>
      /// <returns><c>true</c> when at least one matching row exists.</returns>
      public static bool IsDownload(string book_id, string chapter_num)
        {
            // Only existence matters, so select a constant and stop at the first hit.
            // SELECT * would pull the whole chapter text out of the database on every
            // call, and this method is called once per chapter.
            // e.g. SELECT 1 FROM NovelDetail WHERE book_id = '21663' AND chapter_num = '2' LIMIT 1
            string sql = "SELECT 1 FROM " + db_table_Name
                + " WHERE book_id = :book_id"
                + " AND chapter_num = :chapter_num"
                + " LIMIT 1";
            SQLiteParameter[] parameters = new SQLiteParameter[]
            {
                new SQLiteParameter(":book_id", book_id),
                new SQLiteParameter(":chapter_num", chapter_num),
            };
            return SQLiteHelper.ExecuteDataTable(dbPath, sql, parameters).Rows.Count > 0;
        }
        /// <summary>
        /// Writes the details parsed from the book's index page back to the book's
        /// row, matched by book_id, and logs the number of affected rows.
        /// </summary>
        /// <param name="book">Book whose parsed fields are stored.</param>
        void Update_db(NovelBook book)
        {
            // Use the configured table name instead of a hard-coded "Xianxia", so this
            // write targets the same table the read paths use.
            // e.g. UPDATE Xianxia SET description = 'aa', lastUpdateTime = 'bb' WHERE book_id = '91791'
            StringBuilder sb = new StringBuilder();
            sb.Append("update ").Append(db_Xianxia_table_Name).Append(" set ");
            sb.Append(" lastUpdateTime =:lastUpdateTime,");
            sb.Append(" latest_chapter_name =:latest_chapter_name,");
            sb.Append(" page_source =:page_source,");
            sb.Append(" all_chapter_name =:all_chapter_name,");
            sb.Append(" description =:description");
            sb.Append(" where book_id=:book_id");
            SQLiteParameter[] parameters = new SQLiteParameter[]
            {
                new SQLiteParameter(":lastUpdateTime", book.lastUpdateTime),
                new SQLiteParameter(":latest_chapter_name", book.latest_chapter_name),
                new SQLiteParameter(":page_source", book.page_source),
                new SQLiteParameter(":all_chapter_name", book.all_chapter_name),
                new SQLiteParameter(":description", book.description),
                new SQLiteParameter(":book_id", book.book_id)
            };
            int affectedRows = sqlite.ExecuteNonQuery(sb.ToString(), parameters);
            Log = ("Update_db affectedRows:" + affectedRows + "\r\n");
        }
        /// <summary>
        /// Queues a thread-pool work item that downloads one chapter page, parses it
        /// into a new BookChapter and inserts that chapter into the chapter table.
        /// </summary>
        /// <param name="bookChapter">Chapter to download; its chapter_Url is requested.</param>
        /// <param name="sqlite">Database helper used for the insert.</param>
        public static void Down_chapter(BookChapter bookChapter, SQLiteHelper sqlite)
        {
            // Read the table name now, on the caller's thread, as the original did when it
            // packed the value into its argument array.
            string tableName = db_table_Name;

            // The lambda captures its arguments directly; no object[] round-trip is needed.
            ThreadPool.QueueUserWorkItem(state =>
            {
                Down(bookChapter, (fileContent, downloaded) =>
                {
                    var parsed = new BookChapter(downloaded, fileContent);
                    sqlite.InsertPublicField(tableName, parsed);
                })
                // Down already prints any failure before rethrowing. Reading Exception
                // here only marks the faulted task as observed rather than dropping it.
                .ContinueWith(
                    t => { var observed = t.Exception; },
                    TaskContinuationOptions.OnlyOnFaulted);
            });
        }
        // One client shared by every chapter download. Creating a new HttpClient per
        // request and never disposing it leaves connections behind and can exhaust sockets.
        private static readonly HttpClient ChapterClient = new HttpClient();

        /// <summary>
        /// Downloads the page at <c>bc.chapter_Url</c>, passes the body together with
        /// <paramref name="bc"/> to <paramref name="func"/>, and echoes the URL and
        /// response to the console.
        /// </summary>
        /// <param name="bc">Chapter whose chapter_Url is requested.</param>
        /// <param name="func">Optional callback that receives the response body and <paramref name="bc"/>.</param>
        /// <returns>A task that faults with the original exception when the download or the callback fails.</returns>
        public static async Task Down(BookChapter bc, Action<string, BookChapter> func)
        {
            try
            {
                string response = await ChapterClient.GetStringAsync(bc.chapter_Url);
                func?.Invoke(response, bc);
                Console.WriteLine("\r\nchapter_num:" + bc.chapter_Url + "\r\nresponse:" + response);
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                // Rethrow as-is. Wrapping in new Exception(e.Message) discarded the
                // exception type and the original stack trace.
                throw;
            }
        }
        
    }
}

BookChapter 

using System.Text.RegularExpressions;

namespace novelDownload
{
    /// <summary>
    /// One chapter of a book. The data lives in public fields.
    /// NOTE(review): other code appears to copy and persist these by reflection
    /// (CopyPublicFieldTo, InsertPublicField) — keep field names and order unchanged; confirm.
    /// </summary>
    public class BookChapter
    {
        public string book_id;
        public string book_name;
        public string chapter_num;
        public string chapter_title;
        public string chapter_Url;
        public string chapter_content;


        /// <summary>Formats every field as "name: value", one field per line.</summary>
        public override string ToString()
        {
            string[] parts =
            {
                nameof(book_id) + ": " + book_id,
                nameof(book_name) + ": " + book_name,
                nameof(chapter_num) + ": " + chapter_num,
                nameof(chapter_title) + ": " + chapter_title,
                nameof(chapter_Url) + ": " + chapter_Url,
                nameof(chapter_content) + ": " + chapter_content,
            };
            return string.Join("\r\n", parts);
        }

        /// <summary>Creates an empty chapter.</summary>
        public BookChapter() { }

        /// <summary>
        /// Copies the public fields of <paramref name="bc"/> into this instance and then,
        /// when the content container is found in <paramref name="fileContent"/>, replaces
        /// chapter_content with the cleaned-up text taken from the page.
        /// </summary>
        /// <param name="bc">Chapter whose fields are copied.</param>
        /// <param name="fileContent">Downloaded HTML of the chapter page.</param>
        public BookChapter(BookChapter bc, string fileContent)
        {
            bc.CopyPublicFieldTo(this);

            // Text between the content container's opening tag and the centered footer div.
            Match body = Regex.Match(
                fileContent,
                @"(?<=<div id=""content"" deep=""3"">)([.\S\s]*)(?=<div align=""center"">)");
            if (!body.Success)
            {
                // No content container: keep whatever was copied from bc.
                return;
            }

            // { pattern, replacement } pairs, applied strictly in this order.
            string[][] cleanupSteps =
            {
                // Empty out everything between the first <p> and the last </p>.
                new[] { @"(?<=<p>)([.\S\s]*)(?=<\/p>)", "" },
                // Four non-breaking spaces become one tab.
                new[] { @"&nbsp;&nbsp;&nbsp;&nbsp;", "\t" },
                // A double line break becomes one CRLF.
                new[] { @"<br\/><br\/>", "\r\n" },
                // Drop the now-empty paragraph tags.
                new[] { @"<p><\/p>", "" },
                // NOTE(review): without RegexOptions.Multiline this only matches at the
                // very start of the text — confirm whether removing every blank line was intended.
                new[] { @"^\s*\n", "" },
                // Remove the "to be continued" marker text.
                new[] { @"(未完待续……)", "" },
            };

            string text = body.Value;
            foreach (string[] step in cleanupSteps)
            {
                text = Regex.Replace(text, step[0], step[1]);
            }

            this.chapter_content = text.Trim();
        }
    }
}

 NovelBooks

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace novelDownload
{
    public class NovelBooks
    {
        // Site root prepended to the book and author links parsed from a listing page.
        // NOTE(review): the host looks like a placeholder for the real site — confirm.
        private string mainIp = "https://www.网址.com/";
        // Path of the SQLite database file handed to SQLiteHelper in CreatDb.
        public static string dbPath = @"D:\DB\Demo.db3";
        // Books parsed from the listing page by the constructor.
        List<NovelBook> list = new List<NovelBook>();
        /// <summary>
        /// Parses one listing page: takes the markup between the list container and
        /// the pager, splits it on "&lt;li&gt;" and builds a NovelBook from each piece.
        /// Books for which IsNoEmpty returns true are collected in the internal list.
        /// </summary>
        /// <param name="file_content">Downloaded HTML of the listing page.</param>
        /// <param name="textBox2">Optional text box that receives diagnostic output.</param>
        public NovelBooks(string file_content, TextBox textBox2 = null)
        {
            var listing = file_content.PickUpCenter("<div class=\"l\">", "<div class=\"page_b page_b2\">");
            textBox2?.AppendShow(listing);

            string[] entries = listing.Split(new string[] { "<li>" }, StringSplitOptions.None);

            // Diagnostic dump of every raw piece before parsing starts.
            for (int n = 0; n < entries.Length; n++)
            {
                textBox2?.AppendShow(entries[n]);
                textBox2?.AppendShow("==================");
            }

            foreach (string entry in entries)
            {
                NovelBook book = new NovelBook();
                book.type = FirstGroup(entry, @"<span class=""s1"">\[(.+)\]<\/span>");
                book.book_name = FirstGroup(entry, @"<span class=""s2""><a href=""\/\d+\/\d+\/"" target=""_blank"">(.+)<\/a><\/span>");
                book.book_url = mainIp + FirstGroup(entry, @"<span class=""s2""><a href=""(.+)"" target=""_blank"">");
                // The id is the second numeric path segment of the book URL.
                book.book_id = FirstGroup(book.book_url, @"https:\/\/www.网址.com\/\d+\/(\d+)\/");
                book.authorUrl = mainIp + FirstGroup(entry, @"<span class=""s4""><a href=""(.+)"">");
                book.author = FirstGroup(entry, @"<span class=""s4""><a href=""\/author\/\d+\/"">(.+)<\/a><\/span>");

                textBox2?.AppendShow(book.ToString() + "\r\n");
                if (book.IsNoEmpty())
                {
                    list.Add(book);
                }
            }
        }

        // Returns capture group 1 of the first match, or "" when the pattern does not match.
        private static string FirstGroup(string input, string pattern)
        {
            return Regex.Match(input, pattern).Groups[1].Value;
        }
        /// <summary>
        /// Opens the database at dbPath and calls CreateTable for
        /// <paramref name="db_Table_Name"/>, declaring every book column as TEXT.
        /// </summary>
        /// <param name="db_Table_Name">Name of the book table.</param>
        /// <returns>The helper instance that was used to create the table.</returns>
        static SQLiteHelper CreatDb(string db_Table_Name)
        {
            string[] columns =
            {
                "book_id", "db_addDate", "book_name", "type", "book_url", "author", "authorUrl", "description",
                "lastUpdateTime", "latest_chapter_name", "all_chapter_name", "page_source",
            };

            // One "TEXT" entry per column.
            string[] columnTypes = new string[columns.Length];
            for (int i = 0; i < columnTypes.Length; i++)
            {
                columnTypes[i] = "TEXT";
            }

            SQLiteHelper helper = new SQLiteHelper(dbPath);
            helper.CreateTable(db_Table_Name, columns, columnTypes);
            return helper;
        }
        /// <summary>
        /// Creates the book table and queues one thread-pool work item per listing page
        /// (pages 1 to 187). Each item downloads the page, parses its books and
        /// batch-inserts them into <paramref name="db_Table_Name"/>.
        /// </summary>
        /// <param name="db_Table_Name">Name of the table the parsed books are inserted into.</param>
        public static void All_Down_mian(string db_Table_Name)
        {
            var sqlite = CreatDb(db_Table_Name);
            for (int page = 1; page < 188; page++)
            {
                // Declared inside the loop so each work item captures its own URL.
                string pageUrl = "https://www.网址.com/xianxia_" + page + ".html";

                // The lambda captures its arguments directly; no object[] round-trip is needed.
                ThreadPool.QueueUserWorkItem(state =>
                {
                    Down(pageUrl, fileContent =>
                    {
                        var books = new NovelBooks(fileContent);
                        sqlite.BatchInsertPublicFields(db_Table_Name, books.list);
                    })
                    // The download task was previously dropped, so any failure (network,
                    // parsing or insert) vanished without a trace. Report it instead.
                    .ContinueWith(
                        t => Console.WriteLine("down failed: " + pageUrl + "\r\n" + t.Exception),
                        TaskContinuationOptions.OnlyOnFaulted);
                });
            }
        }
        // One client shared by every listing-page download. Creating a new HttpClient per
        // request and never disposing it leaves connections behind and can exhaust sockets.
        private static readonly HttpClient ListingClient = new HttpClient();

        /// <summary>
        /// Logs the address, downloads it as a string and passes the body to
        /// <paramref name="func"/> when one is supplied.
        /// </summary>
        /// <param name="ip">Address to download.</param>
        /// <param name="func">Optional callback that receives the response body.</param>
        /// <returns>A task that completes after the callback has run.</returns>
        public static async Task Down(string ip = "https://www.cnblogs.com", Action<string> func = null)
        {
            // Log_callback is only assigned by NovelBookDetail._02_Down_chapter_name. When
            // this runs first, the unguarded call threw a NullReferenceException inside
            // the task and the download never started.
            NovelBookDetail.Log_callback?.Invoke("down ip:" + ip + "\r\n");
            string response = await ListingClient.GetStringAsync(ip);
            func?.Invoke(response);
        }
    }
}

NovelBook 

    // One book as stored in the book table; the field names match the column
    // names used by NovelBooks.CreatDb.
    // NOTE(review): other code appears to persist these public fields by reflection
    // (BatchInsertPublicFields) — keep names and order unchanged; confirm.
    public class NovelBook
    {
        public string book_id;
        public string db_addDate;
        public string book_name;
        public string type;
        public string book_url;
        public string author;
        public string authorUrl;
        public string description;// synopsis
        public string lastUpdateTime; // time of the most recent update
        public string latest_chapter_name; // name of the most recent chapter
        public string all_chapter_name; // chapter-list markup extracted from the book's index page
        public string page_source; // full downloaded HTML of the book's index page
    }

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值