采用多线程爬取，使用 System.Data.SQLite 数据库存储。
发现多线程开得太多时，电脑性能不足，会造成部分下载不了，还占用过多资源；
多线程开少了，下载又慢，只能调到中间值，让电脑刚好能处理。
代码是想到哪写到哪，架构混乱，只能说能跑起来。
NovelBookDetail
using System;
using System.Collections.Generic;
using System.Data.SQLite;
using System.Linq;
using System.Net.Http;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace novelDownload
{
public class NovelBookDetail
{
/// <summary>Prefix prepended to the relative chapter links when chapter URLs are built.</summary>
public static string mainIp = "网站不贴出来了";
/// <summary>Path of the SQLite database file handed to <c>SQLiteHelper</c>.</summary>
public static string dbPath = @"D:\DB\Demo.db3";
/// <summary>Name of the table holding one row per book.</summary>
public static string db_Xianxia_table_Name = "Xianxia";
/// <summary>Name of the table holding one row per downloaded chapter.</summary>
public static string db_table_Name = "NovelDetail";
/// <summary>
/// Optional sink that receives every log fragment as it is produced.
/// It may be <c>null</c> until a caller assigns it.
/// </summary>
public static Action<string> Log_callback;
StringBuilder log = new StringBuilder();
/// <summary>
/// Accumulated log text of this instance. Assigning a value appends it to the
/// log (it does not replace it) and forwards it to <see cref="Log_callback"/>.
/// </summary>
public string Log
{
    get
    {
        return log.ToString();
    }
    set
    {
        // The callback is only wired up by some entry points; invoking it
        // unconditionally threw NullReferenceException when it was unset.
        Log_callback?.Invoke(value);
        log.Append(value);
    }
}
private List<BookChapter> list = new List<BookChapter>();
private SQLiteHelper sqlite;
/// <summary>
/// Starts an asynchronous download of the book's index page. When the page
/// arrives, the description, last update time, latest chapter name and the
/// chapter list markup are extracted into <paramref name="book"/> and the
/// book row is updated in the database.
/// </summary>
/// <param name="book">Book whose <c>book_url</c> is downloaded; it is filled in place.</param>
public NovelBookDetail(NovelBook book)
{
    sqlite = new SQLiteHelper(dbPath);
    Log = book.book_url + "\r\n";
    Task download = Down(book.book_url, fileContent =>
    {
        string pattern = @"<meta property=""og:description"" content=""(.+)\/>";
        book.description = Regex.Match(fileContent, pattern).Groups[1].Value;
        pattern = @"og:novel:update_time"" content=""(.+)"" \/>";
        book.lastUpdateTime = Regex.Match(fileContent, pattern).Groups[1].Value;
        pattern = @"og:novel:latest_chapter_name"" content=""(.+)"" \/>";
        book.latest_chapter_name = Regex.Match(fileContent, pattern).Groups[1].Value;
        // Everything between the "正文</dt>" heading and the closing </dl> is the chapter list.
        pattern = @"(?<=正文\<\/dt\>)([.\S\s]*)(?=\<\/dl\>)";
        var result = fileContent.get_Match_Groups_1(pattern);
        result = result.Trim();
        book.all_chapter_name = result;
        book.page_source = fileContent;
        Log = (book.ToString());
        Update_db(book);
    });
    // A constructor cannot await, so the task used to be dropped and any
    // network, parsing or database failure disappeared without a trace.
    // Observe the fault and record it in the log instead.
    download.ContinueWith(
        t =>
        {
            Log = "Down failed: " + book.book_url + "\r\n"
                + t.Exception.GetBaseException().Message + "\r\n";
        },
        TaskContinuationOptions.OnlyOnFaulted);
}
/// <summary>
/// Opens the database at <c>dbPath</c> and asks the helper to create the
/// chapter table (<c>db_table_Name</c>) with six TEXT columns.
/// </summary>
/// <returns>The helper instance used to create the table.</returns>
static SQLiteHelper CreatDb()
{
    SQLiteHelper helper = new SQLiteHelper(dbPath);
    string[] columns =
    {
        "book_id", "book_name", "chapter_num", "chapter_title", "chapter_Url", "chapter_content",
    };
    // Every column is stored as TEXT.
    string[] columnTypes = columns.Select(name => "TEXT").ToArray();
    helper.CreateTable(db_table_Name, columns, columnTypes);
    return helper;
}
/// <summary>
/// Step 1: downloads the book listing pages into the book table.
/// </summary>
/// <param name="textBox">Text box that receives progress messages when no log callback is set yet.</param>
public static void _01_Down_book_name(TextBox textBox)
{
    // NovelBooks.Down reports progress through Log_callback. If this step is
    // run before anything assigned the callback, route the messages to the
    // supplied text box (same wiring as _02_Down_chapter_name) instead of
    // leaving it null.
    if (Log_callback == null && textBox != null)
    {
        Log_callback = (x) => { textBox.AppendShow(x); };
    }
    NovelBooks.All_Down_mian(db_Xianxia_table_Name);
}
/// <summary>
/// Step 2: reads book rows via <c>SQLiteHelper.ReadNullTable</c> for the
/// <c>page_source</c> column and queues one thread-pool work item per row;
/// each work item constructs a <see cref="NovelBookDetail"/>, which downloads
/// and stores the book's index page.
/// </summary>
/// <param name="textBox">Text box that receives the row count and all log output.</param>
public static void _02_Down_chapter_name(TextBox textBox)
{
    var pending = SQLiteHelper.ReadNullTable(dbPath, db_Xianxia_table_Name, "page_source");
    NovelBookDetail.Log_callback = (x) => { textBox.AppendShow(x); };
    textBox.AppendText("read db count:" + pending.Rows.Count);
    for (int rowIndex = 0; rowIndex < pending.Rows.Count; rowIndex++)
    {
        var row = pending.Rows[rowIndex];
        NovelBook book = new NovelBook
        {
            book_id = row["book_id"].ToString(),
            db_addDate = row["db_addDate"].ToString(),
            book_name = row["book_name"].ToString(),
            type = row["type"].ToString(),
            book_url = row["book_url"].ToString(),
            author = row["author"].ToString(),
            authorUrl = row["authorUrl"].ToString(),
            page_source = row["page_source"].ToString(),
        };
        // The book travels to the worker as the work item's state object;
        // constructing NovelBookDetail is what kicks off the download.
        ThreadPool.QueueUserWorkItem(state => new NovelBookDetail((NovelBook)state), book);
    }
}
// One shared client for all book-page downloads. Creating a new HttpClient per
// request (and never disposing it) leaks handlers and sockets, which gets
// worse the more downloads run in parallel.
private static readonly HttpClient s_bookPageClient = new HttpClient();

/// <summary>
/// Downloads <paramref name="ip"/> as a string and passes the body to
/// <paramref name="func"/> when one is supplied.
/// </summary>
/// <param name="ip">Absolute URL to fetch.</param>
/// <param name="func">Optional callback receiving the response body.</param>
async Task Down(string ip = "https://www.cnblogs.com", Action<string> func = null)
{
    string response = await s_bookPageClient.GetStringAsync(ip);
    func?.Invoke(response);
}
/// <summary>Running totals shown in the status text: chapters queued, chapters skipped, books skipped.</summary>
public static int downChapterNum = 0,JumpNum=0,jump_book;
/// <summary>
/// Step 3: walks every book row, extracts its chapter links from
/// <c>all_chapter_name</c> and queues a download for each chapter that is not
/// yet stored. Progress is written to <paramref name="textBox"/>.
/// </summary>
/// <param name="textBox">Text box whose content is replaced with the current status.</param>
public static void _03_Down_all_chapter(TextBox textBox)
{
    ThreadPool.SetMinThreads(1, 1);
    ThreadPool.SetMaxThreads(50, 50);
    string sql = "SELECT book_id,book_name,all_chapter_name FROM " + db_Xianxia_table_Name;
    var books = SQLiteHelper.ExecuteDataTable(dbPath, db_Xianxia_table_Name, sql);
    string mianCount = ("From Table '" + db_Xianxia_table_Name+"' read date count "+ books.Rows.Count);
    var sqlite2 = CreatDb();
    StringBuilder status = new StringBuilder();
    const string pattern_url = @"href=""(.*)"">";
    const string pattern_chapter_name = @""">(.*)<\/a>";
    for (int i = 0; i < books.Rows.Count; i++)
    {
        var row = books.Rows[i];
        var book_id = row["book_id"].ToString();
        var book_name = row["book_name"].ToString();
        var all_chapter_name = row["all_chapter_name"].ToString();
        // One chapter link per line; keep only the lines that look like a chapter anchor.
        var chapters = all_chapter_name
            .Split(new string[] { "\r\n" }, System.StringSplitOptions.None)
            .Where(x => x.Length > 75 && x.Contains(@"<dd> <a style="""" href="""))
            .ToArray();
        string chapterCount = ("book '" + book_name + "' chapters count " + chapters.Length);

        // Skip the whole book when its last chapter is already stored.
        // Chapters are numbered from 1 (j + 1 below), so the last chapter's
        // number is chapters.Length; the previous check used Length - 1 and
        // therefore looked at the second-to-last chapter.
        if (chapters.Length > 0 && IsDownload(book_id, chapters.Length + ""))
        {
            jump_book++;
            status.Clear();
            status.Append(mianCount).Append("\r\n\r\n");
            status.Append(chapterCount).Append("\r\n\r\n");
            status.Append("book all chapters is download");
            textBox.SetShow(status.ToString());
            continue;
        }

        for (int j = 0; j < chapters.Length; j++)
        {
            status.Clear();
            status.Append(mianCount).Append("\r\n\r\n");
            status.Append(chapterCount).Append("\r\n\r\n");

            var itemLine = chapters[j];
            BookChapter bookChapter = new BookChapter();
            bookChapter.book_id = book_id;
            bookChapter.book_name = book_name;
            bookChapter.chapter_Url = mainIp + itemLine.get_Match_Groups_1(pattern_url);
            bookChapter.chapter_title = itemLine.get_Match_Groups_1(pattern_chapter_name);
            bookChapter.chapter_num = j + 1 + "";

            bool alreadyStored = IsDownload(bookChapter.book_id, bookChapter.chapter_num);
            if (alreadyStored)
            {
                JumpNum++;
            }
            else
            {
                downChapterNum++;
            }

            status.Append(alreadyStored ? "is download '" : "start download '");
            status.Append(bookChapter.book_name + "' chapter " + bookChapter.chapter_num + "\r\n\r\n");
            status.Append("download chapter count " + downChapterNum + " ,jump chapter count " + JumpNum + " ,jump book count " + jump_book + "\r\n");
            textBox.SetShow(status.ToString());

            if (!alreadyStored)
            {
                Down_chapter(bookChapter, sqlite2);
            }
        }
    }
}
/// <summary>
/// Tells whether the chapter table already contains a row for the given book
/// and chapter number.
/// </summary>
/// <param name="book_id">Value matched against the <c>book_id</c> column.</param>
/// <param name="chapter_num">Value matched against the <c>chapter_num</c> column.</param>
/// <returns><c>true</c> when at least one matching row exists.</returns>
public static bool IsDownload(string book_id, string chapter_num)
{
    // e.g. SELECT 1 FROM NovelDetail WHERE book_id = '21663' AND chapter_num = '2' LIMIT 1
    // Only existence matters, so do not pull every column (including the
    // full chapter text) of every matching row as SELECT * did.
    string sql = "SELECT 1 FROM " + db_table_Name
        + " WHERE book_id = :book_id"
        + " AND chapter_num = :chapter_num"
        + " LIMIT 1";
    SQLiteParameter[] parameters = new SQLiteParameter[]
    {
        new SQLiteParameter(":book_id", book_id),
        new SQLiteParameter(":chapter_num", chapter_num),
    };
    return SQLiteHelper.ExecuteDataTable(dbPath, sql, parameters).Rows.Count > 0;
}
/// <summary>
/// Writes the fields parsed from the book's index page back to the book row
/// identified by <c>book.book_id</c> and logs the number of affected rows.
/// </summary>
/// <param name="book">Book carrying the values to store.</param>
void Update_db(NovelBook book)
{
    // e.g. UPDATE Xianxia SET description = 'aa', lastUpdateTime = 'bb' WHERE book_id = '91791'
    StringBuilder sb = new StringBuilder();
    // Use the configured table name rather than a second hard-coded copy of
    // it, so this statement follows db_Xianxia_table_Name like the readers do.
    sb.Append("update ").Append(db_Xianxia_table_Name).Append(" set ");
    sb.Append(" lastUpdateTime =:lastUpdateTime,");
    sb.Append(" latest_chapter_name =:latest_chapter_name,");
    sb.Append(" page_source =:page_source,");
    sb.Append(" all_chapter_name =:all_chapter_name,");
    sb.Append(" description =:description");
    sb.Append(" where book_id=:book_id");
    SQLiteParameter[] parameters = new SQLiteParameter[]
    {
        new SQLiteParameter(":lastUpdateTime", book.lastUpdateTime),
        new SQLiteParameter(":latest_chapter_name", book.latest_chapter_name),
        new SQLiteParameter(":page_source", book.page_source),
        new SQLiteParameter(":all_chapter_name", book.all_chapter_name),
        new SQLiteParameter(":description", book.description),
        new SQLiteParameter(":book_id", book.book_id)
    };
    int affectedRows = sqlite.ExecuteNonQuery(sb.ToString(), parameters);
    Log = ("Update_db affectedRows:" + affectedRows + "\r\n");
}
/// <summary>
/// Queues a thread-pool work item that downloads one chapter page and, once
/// the page arrives, inserts the parsed chapter into the chapter table.
/// </summary>
/// <param name="bookChapter">Chapter to download; its <c>chapter_Url</c> is fetched.</param>
/// <param name="sqlite">Helper used to insert the downloaded chapter.</param>
public static void Down_chapter(BookChapter bookChapter, SQLiteHelper sqlite)
{
    Console.WriteLine();
    // Read the table name now, at queue time, not when the worker runs.
    string tableName = db_table_Name;
    ThreadPool.QueueUserWorkItem(unused =>
    {
        Down(bookChapter, (fileContent, downloaded) =>
        {
            // The two-argument constructor extracts the chapter text from the page.
            var parsed = new BookChapter(downloaded, fileContent);
            sqlite.InsertPublicField(tableName, parsed);
        });
    });
}
// One shared client for all chapter downloads. Creating a new HttpClient per
// chapter (and never disposing it) leaks handlers and sockets under heavy
// parallel downloading.
private static readonly HttpClient s_chapterClient = new HttpClient();

/// <summary>
/// Downloads the page at <c>bc.chapter_Url</c>, hands the body and the
/// chapter to <paramref name="func"/> (when supplied) and echoes the response
/// to the console.
/// </summary>
/// <param name="bc">Chapter whose URL is fetched.</param>
/// <param name="func">Optional callback receiving the response body and <paramref name="bc"/>.</param>
/// <exception cref="Exception">Any failure is written to the console and rethrown unchanged.</exception>
public static async Task Down(BookChapter bc, Action<string, BookChapter> func)
{
    try
    {
        string response = await s_chapterClient.GetStringAsync(bc.chapter_Url);
        func?.Invoke(response, bc);
        Console.WriteLine("\r\nchapter_num:" + bc.chapter_Url + "\r\nresponse:" + response);
    }
    catch (Exception e)
    {
        Console.WriteLine(e);
        // Rethrow the original exception: wrapping it in a new Exception
        // discarded both its concrete type and its stack trace.
        throw;
    }
}
}
}
BookChapter
using System.Text.RegularExpressions;
namespace novelDownload
{
/// <summary>
/// One chapter of a book: identifying fields plus the cleaned chapter text.
/// </summary>
public class BookChapter
{
    public string book_id;
    public string book_name;
    public string chapter_num;
    public string chapter_title;
    public string chapter_Url;
    public string chapter_content;

    /// <summary>Renders every field as a "name: value" line, separated by CRLF.</summary>
    public override string ToString()
    {
        return string.Join("\r\n", new[]
        {
            $"{nameof(book_id)}: {book_id}",
            $"{nameof(book_name)}: {book_name}",
            $"{nameof(chapter_num)}: {chapter_num}",
            $"{nameof(chapter_title)}: {chapter_title}",
            $"{nameof(chapter_Url)}: {chapter_Url}",
            $"{nameof(chapter_content)}: {chapter_content}",
        });
    }

    public BookChapter() { }

    /// <summary>
    /// Copies the public fields of <paramref name="bc"/> into this instance and,
    /// when the content container is found in <paramref name="fileContent"/>,
    /// stores the cleaned chapter text in <c>chapter_content</c>.
    /// </summary>
    /// <param name="bc">Chapter whose public fields are copied.</param>
    /// <param name="fileContent">Downloaded HTML of the chapter page.</param>
    public BookChapter(BookChapter bc, string fileContent)
    {
        bc.CopyPublicFieldTo(this);

        Match container = Regex.Match(
            fileContent,
            @"(?<=<div id=""content"" deep=""3"">)([.\S\s]*)(?=<div align=""center"">)");
        if (!container.Success)
        {
            // No content container: leave chapter_content as copied from bc.
            return;
        }

        // Clean-up passes, applied in this exact order: { pattern, replacement }.
        string[][] passes =
        {
            new[] { @"(?<=<p>)([.\S\s]*)(?=<\/p>)", "" },  // drop whatever sits inside <p>…</p>
            new[] { @" ", "\t" },                          // NOTE(review): pattern is a single space; confirm this is intended
            new[] { @"<br\/><br\/>", "\r\n" },             // double line break -> CRLF
            new[] { @"<p><\/p>", "" },                     // remove the now-empty paragraph tags
            new[] { @"^\s*\n", "" },                       // leading blank lines
            new[] { @"(未完待续……)", "" },                   // "to be continued" marker
        };

        string text = container.Value;
        foreach (string[] pass in passes)
        {
            text = Regex.Replace(text, pass[0], pass[1]);
        }
        this.chapter_content = text.Trim();
    }
}
}
NovelBooks
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace novelDownload
{
public class NovelBooks
{
private string mainIp = "https://www.网址.com/";
public static string dbPath = @"D:\DB\Demo.db3";
List<NovelBook> list = new List<NovelBook>();

// Returns the first capture group of the first match ("" when nothing matches).
private static string FirstGroup(string input, string pattern)
{
    return Regex.Match(input, pattern).Groups[1].Value;
}

/// <summary>
/// Parses one listing page: cuts out the book list section, splits it on
/// "&lt;li&gt;" and turns each piece into a <see cref="NovelBook"/>. Books for
/// which <c>IsNoEmpty()</c> returns true are kept in <c>list</c>.
/// </summary>
/// <param name="file_content">HTML of the listing page.</param>
/// <param name="textBox2">Optional text box that receives the intermediate results.</param>
public NovelBooks(string file_content, TextBox textBox2 = null)
{
    var section = file_content.PickUpCenter("<div class=\"l\">", "<div class=\"page_b page_b2\">");
    textBox2?.AppendShow(section);

    string[] entries = section.Split(new string[] { "<li>" }, StringSplitOptions.None);

    // First pass: dump every raw piece, each followed by a separator line.
    foreach (string entry in entries)
    {
        textBox2?.AppendShow(entry);
        textBox2?.AppendShow("==================");
    }

    // Second pass: extract the fields of each piece.
    foreach (string entry in entries)
    {
        NovelBook book = new NovelBook();
        book.type = FirstGroup(entry, @"<span class=""s1"">\[(.+)\]<\/span>");
        book.book_name = FirstGroup(entry, @"<span class=""s2""><a href=""\/\d+\/\d+\/"" target=""_blank"">(.+)<\/a><\/span>");
        book.book_url = mainIp + FirstGroup(entry, @"<span class=""s2""><a href=""(.+)"" target=""_blank"">");
        // The id is the second numeric path segment of the book URL.
        book.book_id = FirstGroup(book.book_url, @"https:\/\/www.网址.com\/\d+\/(\d+)\/");
        book.authorUrl = mainIp + FirstGroup(entry, @"<span class=""s4""><a href=""(.+)"">");
        book.author = FirstGroup(entry, @"<span class=""s4""><a href=""\/author\/\d+\/"">(.+)<\/a><\/span>");
        textBox2?.AppendShow(book.ToString() + "\r\n");
        if (book.IsNoEmpty())
        {
            list.Add(book);
        }
    }
}
/// <summary>
/// Opens the database at <c>dbPath</c> and asks the helper to create the book
/// table with twelve TEXT columns.
/// </summary>
/// <param name="db_Table_Name">Name of the table to create.</param>
/// <returns>The helper instance used to create the table.</returns>
static SQLiteHelper CreatDb(string db_Table_Name)
{
    SQLiteHelper helper = new SQLiteHelper(dbPath);
    string[] columns =
    {
        "book_id", "db_addDate", "book_name", "type", "book_url", "author", "authorUrl", "description",
        "lastUpdateTime", "latest_chapter_name", "all_chapter_name", "page_source",
    };
    // Every column is stored as TEXT.
    string[] columnTypes = columns.Select(name => "TEXT").ToArray();
    helper.CreateTable(db_Table_Name, columns, columnTypes);
    return helper;
}
/// <summary>
/// Creates the book table and queues one thread-pool work item per listing
/// page; each work item downloads the page, parses it into books and inserts
/// them into the table.
/// </summary>
/// <param name="db_Table_Name">Table that receives the parsed books.</param>
/// <param name="lastPage">
/// Number of the last listing page to fetch (pages are numbered from 1).
/// Defaults to 187, the range that used to be hard-coded.
/// </param>
public static void All_Down_mian(string db_Table_Name, int lastPage = 187)
{
    var sqlite = CreatDb(db_Table_Name);
    const string ip_forward = "https://www.网址.com/xianxia_";
    const string ip_end = ".html";
    for (int page = 1; page <= lastPage; page++)
    {
        // Declared inside the loop so every work item captures its own URL.
        string ip = ip_forward + page.ToString() + ip_end;
        ThreadPool.QueueUserWorkItem(
            unused =>
            {
                Down(ip, fileContent =>
                {
                    var books = new NovelBooks(fileContent);
                    sqlite.BatchInsertPublicFields(db_Table_Name, books.list);
                });
            });
    }
}
// One shared client for all listing-page downloads. Creating a new HttpClient
// per request (and never disposing it) leaks handlers and sockets.
private static readonly HttpClient s_httpClient = new HttpClient();

/// <summary>
/// Reports the URL through <c>NovelBookDetail.Log_callback</c> (when one is
/// set), downloads it as a string and passes the body to
/// <paramref name="func"/> when one is supplied.
/// </summary>
/// <param name="ip">Absolute URL to fetch.</param>
/// <param name="func">Optional callback receiving the response body.</param>
public static async Task Down(string ip = "https://www.cnblogs.com", Action<string> func = null)
{
    // The callback is optional; invoking it unconditionally threw
    // NullReferenceException into the returned task before any download began.
    NovelBookDetail.Log_callback?.Invoke("down ip:" + ip + "\r\n");
    string response = await s_httpClient.GetStringAsync(ip);
    func?.Invoke(response);
}
}
}
NovelBook
// One book as listed on the site; every value is kept as a string.
public class NovelBook
{
public string book_id;
public string db_addDate;
public string book_name;
public string type;
public string book_url;
public string author;
public string authorUrl;
public string description;// synopsis
public string lastUpdateTime; // time of the most recent update
public string latest_chapter_name; // name of the most recent chapter
public string all_chapter_name; // chapter list markup (all chapter names)
public string page_source; // raw HTML of the book's index page
}