抖音视频评论区提取工具的数据说明记录
DY视频评论监控提取
客户端截图:
一:系统采用的开发工具
系统采用C#进行开发,包括软件版客户端、电脑网页端和手机端。
服务端包括采集端和数据库。
二:关机能否继续监控和采集
建立好任务后,关闭软件或者是电脑也可进行评论采集。打开软件后可看到最新的评论对应数据。
三:提取到的评论为什么有的评论时间是00
提取到的评论原始时间并不是精确时间。24小时以内的评论,原始时间显示为“几分钟前、几小时前”,系统可以据此推算出具体的日期和时间;超过24小时的评论,原始时间显示为“几天前、几周前、几个月前、半年前”等,系统只能按照当前日期进行推算,推算出来的结果只有日期、没有具体时间,所以时间部分显示为00。
四:系统都可以监控和采集什么类型的模块评论
系统可以采集的抖音评论类型有:1:关键词评论 2:博主评论 3:单视频评论。同时支持设置所采集视频的时间范围,并支持持续监控。
五:采集到评论都包含什么内容
采集到的评论内容包含 1:评论人昵称 2:评论视频名称 3:视频作者 4:视频地址 5:作者地址 6:评论人地址 (MS4地址) 7:评论内容 8:命中词语 9:抖音号 10:评论人二维码
六:采集代码流程的顺序
这里只记录 关键词代码采集顺序思路和服务端源代码
思路
采集第一步:获取任务关键词和筛选条件,筛选条件包括不限 、一天内、一周内、半年内
采集第二步:根据关键词打开对应的搜索地址,在抖音页面上找到筛选条件所在的层并进行操作,然后开始搜索数据。
采集第三步:开始自动下拉并且循环列表,提取视频ID
采集第四步:获取提取到的关键词视频ID,拼接视频地址,然后获取视频评论层进行自动操作、然后自动下拉获取评论层,循环获取评论列表即可。(自行解析即可)
代码如下:
/// <summary>
/// Endless worker loop: takes the first queued row id from listBox1, loads that video in the
/// embedded browser control, clicks and scrolls the comment layer, passes the page HTML to the
/// parsing helpers, then sets li=1 on the row and removes it from the queue.
/// </summary>
/// <remarks>
/// Progress is tracked by flags that are declared outside this method:
/// duqu_dat (0 = the row still has to be read from the database),
/// jiazai (0 = the browser control has not been created yet),
/// jiazai1 (0 = the page for the current video still has to be loaded),
/// dianji_count (0 = the click script has not been run yet),
/// xunhuan (number of passes made over the loaded page, compared against t_xunhuan.Text).
/// The loop has no break/return, so the method never returns normally.
/// NOTE(review): in this paste several helper methods appear in the middle of the final catch
/// block of this method — confirm the real file layout before editing.
/// </remarks>
private void list_ceng()
{
// These were once locals; they are now assigned without a declaration, so they live outside this method.
//int jiazai1 = 0;
//int xiala_int = 0;
//int duqu_dat = 0;// read from database
//int xunhuan = 0;
//dianji_count = 0;
//int dianji_count1 = 0;
// Reset all per-video state (several assignments below are repeated verbatim).
jiazai1 = 0;
xiala_int = 0;
duqu_dat = 0;// read from database
xunhuan = 0;
dianji_count1 = 0;
dianji_count = 0;
dianji_count1 = 0;
dianji_count = 0;
xiala_int = 0;
jiazai1 = 0;
duqu_dat = 0;
yichang = 0;
label2.Text = "";
label7.Text = "";
while (true)
{
int panduan_count = 0;
close_ceng();
close_ceng_pinglun();
try
{
if (listBox1.Items.Count > 0)
{
// Step 1: load the database row for the first queued id (only once per video).
if (duqu_dat == 0)
{
t_rizhi.Text = "获取到视频ID 数据库ID";
int v_id = Convert.ToInt32(listBox1.Items[0].ToString());
edit_id = v_id;
try
{
t_rizhi.Text = "开始获取数据库ID 数据库ID";
// still need to fetch one more thing here: the video to extract
try
{
// v_id is an int, so this concatenation cannot carry arbitrary text into the SQL.
SqlCommand comm1 = new SqlCommand("select username,xnumbers,video_id from video_id where id=" + v_id + " ", conn1);
conn1.Open();
SqlDataReader dr1 = comm1.ExecuteReader();
if (dr1.Read())
{
username = dr1.GetString(0).Trim();
xnumbers = dr1.GetString(1).Trim();
video_id = dr1.GetString(2).Trim();
mp4_url = "https://www.douyin.com/video/" + video_id.Trim();
video_url = mp4_url.Trim();
}
conn1.Close();
duqu_dat = 1;
}
catch
{
// NOTE(review): the error is swallowed; duqu_dat stays 0 and execution still falls through
// to the panduan check below with whatever xnumbers/video_id held before.
conn1.Close();
}
}
catch
{
conn1.Close();
duqu_dat = 0;
}
}
// Step 2: ask panduan.panduan_piliang_1 about this task number; only a result of 1 lets the video be processed.
panduan aa_count = new panduan();
panduan_count = Convert.ToInt32(aa_count.panduan_piliang_1(xnumbers.Trim()));
if (panduan_count == 1)
{
if (jiazai == 0)
{
// First use: create the browser control and add it to panel2.
// NOTE(review): panel2 is marshalled via Invoke, but t_rizhi/listBox1/listView1/labels are
// touched directly — confirm which thread runs this method.
listView1.Items.Clear();
chromeBrowser2 = new ChromiumWebBrowser("https://www.douyin.com/channel/300203?modal_id=" + video_id.Trim() + "");
chromeBrowser2.Dock = DockStyle.Fill;
panel2.Invoke(new MethodInvoker(() =>
{
panel2.Controls.Add(chromeBrowser2);
}));
}
else
{
// Browser already exists: navigate it, but only once per video (jiazai1 guards the reload).
if (jiazai1 == 0)
{
t_rizhi.Text = "开始加载页面";
listView1.Items.Clear();
chromeBrowser2.Load("https://www.douyin.com/channel/300203?modal_id=" + video_id.Trim() + "");
}
}
if (!chromeBrowser2.IsBrowserInitialized || chromeBrowser2.IsLoading)
{
t_rizhi.Text = "页面加载中";
jiazai = 1;
jiazai1 = 1;
// the browser control has not finished loading yet; try again on the next pass
}
else
{
t_rizhi.Text = "页面加载完毕";
string wangzhi = chromeBrowser2.Address.Trim();
// Only continue when the browser is actually showing the current video.
if (wangzhi.Contains(video_id.Trim()))
{
t_rizhi.Text = "地址和当前匹配";
// NOTE(review): .Result blocks the calling thread until the page source is available.
string dianji_html = chromeBrowser2.GetSourceAsync().Result;
// The class string below is used as the marker that the click target is present in the page.
if (dianji_html.Contains("KqyADpcN tvnVKTp7 Qdw5P5DX"))
// if (dianji_html.Contains("HV3aiR5J comment-mainContent iV2CcAAV"))
{
// (an older, commented-out retry-click branch was removed from here)
#region///用于点击评论
if (dianji_count == 0)
{
t_rizhi.Text = "点击层";
Thread.Sleep(1000);
chromeBrowser2.ExecuteScriptAsync(@"
var divElement = document.querySelector('div.jp8u3iov');
if (divElement) {
divElement.click();
}
"); close_ceng();
close_ceng_pinglun();
dianji_count = 1;
label7.Text = dianji_count.ToString();
dianji_count1 = dianji_count1 + 1;
}
}
#endregion
// Count this pass; the counter is compared with t_xunhuan.Text below to cap the scrolling.
xunhuan = xunhuan + 1;
// xunhuan = xunhuan + 1;
jiazai = 1;
jiazai1 = 1;
// page loaded: scroll the comment layer down
if (xiala_int == 0)
{
xiala_int = 1;
}
if (xiala_int == 1)
{
// Thread.Sleep(2000);
t_rizhi.Text = "下拉层";
//Thread.Sleep(2000);
xiala_ceng();
}
// Re-read the page source after scrolling; jieshu keeps it for the end-of-list checks.
string htmlContent = chromeBrowser2.GetSourceAsync().Result;
jieshu = htmlContent;
jiexi_title(htmlContent);
// (older commented-out per-field extraction calls were removed from here; the fields are
// parsed in the branch below)
// Finish this video when the page reports no more comments or the pass limit is reached.
if (jieshu.Contains("暂时没有更多评论") || xunhuan >= Convert.ToInt32(t_xunhuan.Text))
{
close_ceng();
close_ceng_pinglun();
if (xunhuan >= Convert.ToInt32(t_xunhuan.Text))
{
// NOTE(review): dead code — xxx is never read and the try block is empty.
string xxx = "1";
conn1.Close();
try
{
}
catch
{
conn1.Close();
}
}
t_rizhi.Text = "获取视频名称";
title = ceng_jiexi.title_ceng(htmlContent);
//title_ceng(htmlContent);
// jiexi_title(htmlContent);
t_rizhi.Text = "获取作者名称";
// NOTE(review): title is assigned a second time here with the same call.
title = ceng_jiexi.title_ceng(htmlContent);
zuozhe_name = ceng_jiexi.zuozhe_ceng(htmlContent);
//zuozhe_ceng(htmlContent);
t_rizhi.Text = "获取视频日期";
shipin_dates = ceng_jiexi.shipin_shijian_ceng(htmlContent);
// shipin_dates_ceng(htmlContent);
t_rizhi.Text = "获取作者地址";
zuozhe_url = ceng_jiexi.zuozhe_url_ceng(htmlContent);
// zuozhe_url_ceng(htmlContent);
// shipin_pingliun1(htmlContent);
int xunhuan_count = 0;
// start querying data: load the comma-separated mingzhong_key list for this task into listBox2
string mingzhong_key1 = "";
try
{
t_rizhi.Text = "获取关键词";
// NOTE(review): xnumbers is concatenated into the SQL text; a parameterized query would be safer.
SqlCommand comm_mingzhong = new SqlCommand("select mingzhong_key from mission where xnumbers='" + xnumbers.Trim() + "' ", conn1);
conn1.Open();
listBox2.Items.Clear();
SqlDataReader dr_mingzhong = comm_mingzhong.ExecuteReader();
if (dr_mingzhong.Read())
{
mingzhong_key1 = dr_mingzhong.GetString(0).Trim();
}
conn1.Close();
string[] items = mingzhong_key1.Split(',');
foreach (string item in items)
{
// add only the non-empty substrings
if (!string.IsNullOrWhiteSpace(item))
{
listBox2.Items.Add(item.Trim());
}
}
}
catch
{
conn1.Close();
}
// Batch marker built from the current time.
// NOTE(review): "hh" is the 12-hour clock, so morning and afternoon runs can produce the same prefix.
x_pinglun = DateTime.Now.ToString("yyyyMMddhhmmssffff") + DateTime.Now.Second.ToString();
t_rizhi.Text = "获取视频评论";
shipin_pingliun1(htmlContent);
// (a commented-out completion message box and an inner end-of-comments check were removed from here)
xiala_int = 0;
jiazai1 = 0;
yichang = 0;
label2.Text = "";
try
{
// Set li=1 on the row, drop it from the queue and reset the per-video counters.
t_rizhi.Text = "更新数据";
SqlCommand comm_gengxin = new SqlCommand("update video_id set li=1 where id=" + edit_id + "", conn1);
conn1.Open();
comm_gengxin.ExecuteNonQuery();
conn1.Close();
//chromeBrowser2.ExecuteScriptAsync("document.documentElement.innerHTML = '';");
//chromeBrowser2.Load("about:blank");
listBox1.Items.RemoveAt(0);
xunhuan = 0;
dianji_count = 0;
dianji_count1 = 0;
label7.Text = "";
// Delete the "Cache" folder under the path in t_path; failures are only logged.
string cachePath = t_path.Text + "/Cache";//Path.Combine(Path.GetDirectoryName(chromeBrowser2.GetBrowser().GetHost().RequestContext.CachePath), "Cache");
try
{
Directory.Delete(cachePath, true);
Console.WriteLine("缓存已清除");
}
catch (Exception ex)
{
Console.WriteLine("清除缓存时出错:" + ex.Message);
}
}
catch
{
conn1.Close();
}
duqu_dat = 0;
}
// Video with no comments at all: reset, set li=1 on the row and move on.
// NOTE(review): this is a separate if, not an else-if; when the branch above ran because the pass
// limit was reached and the page also contains this text, RemoveAt(0) executes twice in one pass — confirm intent.
if (jieshu.Contains("暂无评论"))
{
jiazai1 = 0;
xiala_int = 0;
duqu_dat = 0;// read from database
xunhuan = 0;
dianji_count1 = 0;
dianji_count = 0;
dianji_count1 = 0;
dianji_count = 0;
xiala_int = 0;
jiazai1 = 0;
duqu_dat = 0;
yichang = 0;
label2.Text = "";
label7.Text = "";
try
{
SqlCommand comm_gengxin = new SqlCommand("update video_id set li=1 where id=" + edit_id + "", conn1);
conn1.Open();
comm_gengxin.ExecuteNonQuery();
conn1.Close();
//chromeBrowser2.ExecuteScriptAsync("document.documentElement.innerHTML = '';");
//chromeBrowser2.Load("about:blank");
listBox1.Items.RemoveAt(0);
video_id = "";
//string cachePath = t_path.Text;
string cachePath = t_path.Text + "/Cache"; //Path.Combine(Path.GetDirectoryName(chromeBrowser2.GetBrowser().GetHost().RequestContext.CachePath), "Cache");
try
{
Directory.Delete(cachePath, true);
Console.WriteLine("缓存已清除");
}
catch (Exception ex)
{
Console.WriteLine("清除缓存时出错:" + ex.Message);
}
}
catch
{
conn1.Close();
}
}
//Thread.Sleep(TimeSpan.FromSeconds(3));
}
else
{
// The browser address does not contain the current video id yet: keep waiting without reloading.
jiazai = 1;
jiazai1 = 1;
}
}
}
else
{
// panduan_piliang_1 did not return 1: skip this row (it is removed from the queue; li is not updated here).
jiazai1 = 0;
xiala_int = 0;
duqu_dat = 0;// read from database
xunhuan = 0;
dianji_count1 = 0;
dianji_count = 0;
dianji_count1 = 0;
dianji_count = 0;
xiala_int = 0;
jiazai1 = 0;
duqu_dat = 0;
yichang = 0;
label2.Text = "";
label7.Text = "";
// chromeBrowser2.Load("about:blank");
listBox1.Items.RemoveAt(0);
}
}
}
catch
{
// Any exception in the pass above: reset the per-video state so the next pass starts over.
jiazai1 = 0;
xiala_int = 0;
duqu_dat = 0;// read from database
xunhuan = 0;
dianji_count1 = 0;
dianji_count = 0;
/// <summary>
/// Returns the double-quoted <c>src</c> attribute value of the first <c>&lt;source ...&gt;</c> tag found in the input.
/// </summary>
/// <param name="input">Markup to search.</param>
/// <returns>The attribute value, or <c>null</c> when no such tag is present.</returns>
static string GetSrcFromSourceTag(string input)
{
    Match sourceTag = Regex.Match(input, @"<source[^>]+src\s*=\s*""([^""]+)""");
    // null (not an empty string) signals "no <source> tag with a src attribute".
    return sourceTag.Success ? sourceTag.Groups[1].Value : null;
}
/// <summary>
/// Reads the video title from the <c>&lt;meta name="lark:url:video_title" content="..."&gt;</c> tag.
/// </summary>
/// <param name="html">Page source to search.</param>
/// <returns>The <c>content</c> attribute value, or an empty string when the tag is not found.</returns>
public string title_ceng1(string html)
{
    const string metaPattern = @"<meta\s+name=""lark:url:video_title""\s+content=""([^""]+)""";

    // Groups[1].Value is "" for a failed match, so no explicit Success check is required.
    string title_b = Regex.Match(html, metaPattern).Groups[1].Value;
    Console.WriteLine("Content Value: " + title_b);
    return title_b;
}
/// <summary>
/// Extracts the video title from the first <c>&lt;span class="arnSiSbK hT34TYMB ONzzdL2F"&gt;</c> element.
/// </summary>
/// <param name="html">Page source to search.</param>
/// <returns>
/// The span's inner text with "&lt;span&gt;" fragments, every "/" character and any remaining tags removed;
/// an empty string when the span is not found or any error occurs.
/// </returns>
public string title_ceng(string html)
{
    string title_b = "";
    try
    {
        Match titleSpan = Regex.Match(
            html,
            @"<span class=""arnSiSbK hT34TYMB ONzzdL2F"">(.*?)</span>",
            RegexOptions.IgnoreCase);
        if (!titleSpan.Success)
        {
            return title_b;
        }

        // Drop nested opening spans and slashes first, then whatever tags are left over.
        string inner = titleSpan.Groups[1].Value.Replace("<span>", "").Replace("/", "");
        title_b = Regex.Replace(inner, @"<[^>]*>", string.Empty);
    }
    catch
    {
        // Best effort: on any failure the caller gets what was extracted so far.
    }
    return title_b;
}
/// <summary>
/// Extracts the publish time shown in the first <c>&lt;span class="time"&gt;</c> element and normalises it.
/// </summary>
/// <remarks>
/// Handling, in order:
/// <list type="bullet">
/// <item>Relative labels ("3分钟前", "5小时前", "3天前", "2周前", "1个月前", "1年前", "半年前") are counted
/// back from the current local time. Day/week/month/year results are formatted with
/// <c>ToShortDateString()</c>, minute/hour results with <c>ToString()</c> (both culture-dependent, as before).</item>
/// <item>Absolute dates that include a four-digit year have 年/月 turned into "-" and 日 removed
/// ("2023年5月27日" becomes "2023-5-27").</item>
/// <item>Month-day dates without a year get a year prefix: the current year, or the previous year
/// when that month-day would otherwise lie in the future.</item>
/// <item>Anything else is returned as cleaned text.</item>
/// </list>
/// </remarks>
/// <param name="html">Page source to search.</param>
/// <returns>The normalised time text, or an empty string when the span is missing or <paramref name="html"/> is null/empty.</returns>
public string shipin_shijian_ceng(string html)
{
    string shipin_dates = "";
    if (string.IsNullOrEmpty(html))
    {
        return shipin_dates;
    }

    try
    {
        Match match = Regex.Match(html, @"<span class=""time"">(.*?)</span>", RegexOptions.IgnoreCase);
        if (!match.Success)
        {
            return shipin_dates;
        }

        string raw = match.Groups[1].Value.Trim();
        raw = raw.Replace("<span>", "").Replace("/", "").Replace("·", "").Trim();
        shipin_dates = raw;

        // Relative labels must be recognised BEFORE 年/月/日 are rewritten below; otherwise
        // "1个月前" and "1年前" lose their unit characters and can no longer be parsed.
        DateTime now = DateTime.Now;
        DateTime relative;
        bool dateOnly;
        if (TryParseRelativeVideoTime(raw, now, out relative, out dateOnly))
        {
            shipin_dates = dateOnly ? relative.ToShortDateString() : relative.ToString();

            // Informational only: report whether the video is within two years (730 days) of now.
            if (Math.Abs((now - relative).TotalDays) <= 730)
            {
                Console.WriteLine("视频时间和当前时间在两年内");
            }
            else
            {
                Console.WriteLine("视频时间和当前时间不在两年内");
            }
            return shipin_dates;
        }

        string absolute = raw.Replace("日", "").Replace("年", "-").Replace("月", "-");
        shipin_dates = absolute;

        // A four-digit number means the year is already present: nothing more to do.
        if (Regex.IsMatch(absolute, @"\b\d{4}\b"))
        {
            return shipin_dates;
        }

        Match monthDay = Regex.Match(absolute, @"\b(\d{1,2})-(\d{1,2})\b");
        if (monthDay.Success)
        {
            int month;
            int day;
            int year = now.Year;
            if (int.TryParse(monthDay.Groups[1].Value, out month) &&
                int.TryParse(monthDay.Groups[2].Value, out day) &&
                (month > now.Month || (month == now.Month && day > now.Day)))
            {
                // A month-day later than today cannot belong to the current year.
                year--;
            }
            shipin_dates = year + "-" + absolute;
            return shipin_dates;
        }

        Console.WriteLine("输入字符串既没有年份信息,也不符合日期格式");
    }
    catch (Exception ex)
    {
        // Best effort: keep whatever text was extracted, but do not hide the failure entirely.
        Console.WriteLine("shipin_shijian_ceng: " + ex.Message);
    }
    return shipin_dates;
}

// Converts a relative label ("3天前", "2周前", "1个月前", "半年前", ...) into a point in time counted back from now.
// dateOnly is false for minute/hour units, where the time of day is meaningful.
private static bool TryParseRelativeVideoTime(string text, DateTime now, out DateTime result, out bool dateOnly)
{
    result = now;
    dateOnly = true;

    if (text.Contains("半年前"))
    {
        result = now.Date.AddMonths(-6);
        return true;
    }

    // 月/年 must be followed by 前 so that absolute dates like "5月27日" are not mistaken for "5 months ago".
    Match m = Regex.Match(text, @"(\d+)\s*(?:(分钟|小时|天|周)|个?(月|年)前)");
    int amount;
    if (!m.Success || !int.TryParse(m.Groups[1].Value, out amount))
    {
        return false;
    }

    string unit = m.Groups[2].Success ? m.Groups[2].Value : m.Groups[3].Value;
    switch (unit)
    {
        case "分钟":
            // Subtract from the current time, not from midnight.
            result = now.AddMinutes(-amount);
            dateOnly = false;
            break;
        case "小时":
            result = now.AddHours(-amount);
            dateOnly = false;
            break;
        case "天":
            result = now.Date.AddDays(-amount);
            break;
        case "周":
            result = now.Date.AddDays(-7.0 * amount);
            break;
        case "月":
            result = now.Date.AddMonths(-amount);
            break;
        default:
            result = now.Date.AddYears(-amount);
            break;
    }
    return true;
}
/// <summary>
/// Extracts the video author's name from the first <c>&lt;span class="arnSiSbK ypGAC_xH ONzzdL2F"&gt;</c> element.
/// </summary>
/// <param name="html">Page source to search.</param>
/// <returns>
/// The span's inner text with "&lt;span&gt;" fragments and every "/" character removed;
/// an empty string when the span is not found or any error occurs.
/// </returns>
public string zuozhe_ceng(string html)
{
    string zuozhe = "";
    try
    {
        Match authorSpan = Regex.Match(
            html,
            @"<span class=""arnSiSbK ypGAC_xH ONzzdL2F"">(.*?)</span>",
            RegexOptions.IgnoreCase);
        if (authorSpan.Success)
        {
            zuozhe = authorSpan.Groups[1].Value.Replace("<span>", "").Replace("/", "");
        }
    }
    catch
    {
        // Best effort: any failure leaves the author name empty.
    }
    return zuozhe;
}
/// <summary>
/// Extracts the author's profile link: the first <c>href</c> value that follows
/// <c>&lt;div class="B0JKdzQ8 sVGJfEdt KsoclCZj"&gt;</c> on the same line of the page source.
/// </summary>
/// <param name="html">Page source to search.</param>
/// <returns>The <c>href</c> value, or an empty string when no match is found.</returns>
public string zuozhe_url_ceng(string html)
{
    Match anchor = Regex.Match(html, "<div class=\"B0JKdzQ8 sVGJfEdt KsoclCZj\">.*?href=\"([^\"]+)\"");
    if (!anchor.Success)
    {
        return "";
    }
    return anchor.Groups[1].Value;
}
// Remainder of list_ceng's catch block: finish resetting the per-video flags and clear the status labels.
dianji_count1 = 0;
dianji_count = 0;
xiala_int = 0;
jiazai1 = 0;
duqu_dat = 0;
yichang = 0;
label2.Text = "";
label7.Text = "";
// chromeBrowser2.Load("about:blank");
// MessageBox.Show("958");
}
// Pause between passes: textBox1 holds the delay in seconds; the exact text "0" skips the sleep.
// NOTE(review): this runs outside the try/catch, so a non-numeric textBox1 value makes
// Convert.ToInt32 throw out of the loop.
if (textBox1.Text != "0")
{
Thread.Sleep(TimeSpan.FromSeconds(Convert.ToInt32(textBox1.Text)));
}
}
}