利用goquery爬取文章及目录,结合electron实现极简阅读界面

计划实现功能一览:

  • 输入 & 综合检索 & 把所有源符合结果显示到页面
  • 点击后展示梗概&目录
  • 点击目录导航栏获取章节内容(目录导航栏转到对应章节)
  • 预先加载上下文(去掉手动点击,自动加载下一章)
  • 书架-移动分组(TODO 未添加过的情况) initMenu()
  • 保存进度(哪个链接 高度几何) save()
  • 缓存(避免反复请求同一章内容)
  • 兼容上下章快捷键

本次实现功能:根据设定好的书源规则爬取网络文章的目录&内容,显示到前端页面中;设置目录导航栏,点击章节名称后加载对应章节内容

存储信息:
searchRules.json
id用于绑定书籍,url为网站首页,body和chapter分别为正文内容、章节目录所在元素的选择器,dir、last、next分别为"返回目录""上一章""下一章"这三个a标签的文本内容(后端按a标签文本精确匹配来获取对应链接)
eg.

[
    {
        "id":1,
        "name":"啃书虎",
        "url":"http://www.kenshuhu.com",
        "body":"#nr",
        "chapter": ".panel-body",
        "dir":"返回目录",
        "last":"上一篇",
        "next":"下一篇"
    },
    {
        "id":2,
        "name":"666文学",
        "url":"https://www.6666xsw.com",
        "body":"#content",
        "chapter": "#chapterlist",
        "dir":"回目录",
        "last":"<<上一章",
        "next":"下一章>>"
    }
]

readingHistory.json
之前只存储了本地书籍的阅读记录,为兼容网络书籍,需要额外添加四个字段:
readUrl 当前章节的起始url(由于一章有时会被分成多页,这里只保存该章节第一页的url),用于保存阅读进度,规定下次打开时默认加载的起始章节
searchId 对应的书源规则id
以上两个为必要字段,下面两个是为了兼容书架和目录规则设计,用online区分打开方式(获取本地书籍/网络书籍),dirId写死为-1,不参与自定义目录规则

	"online": true,
	"dirId": -1

eg.

[
	{
		"fileName": "仙武帝尊",
		"fontSize": "13px",
		"readUrl": "/txt/32408/12111702.html",
		"readHeight": 0.28829485913402136,
		"readTime": 1710899455542,
		"searchId": 2,
		"groupId": -1,
		"bookName": "仙武帝尊",
		"bookPath": "/txt/32408/",
		"online": true,
		"dirId": -1
	},
	{
		"fileName": "《文章2end》作者:213.txt",
		"fontSize": "13px",
		"readHeight": 0.6650011217271959,
		"readTime": 1710897281781,
		"dirId": 1,
		"groupId": 3,
		"bookName": "文章2end",
		"bookPath": "D:/Go/src/practice/playv3/donut-client/userData/testReader/未分类/《文章2end》作者:213.txt"
	}
]

后端代码:
引入包 “github.com/PuerkitoBio/goquery”,框架为gin

// indexStoryHandleFunc 小说章节目录
// @Summary 小说章节目录
// @Description 小说章节目录
// @Tags 查找小说
// @Param url formData string true "小说首页"
// @Param chapter formData string true "章节标签"
// @Router /story/chapters [POST]
// indexStoryHandleFunc returns the chapter list of a novel as JSON.
//
// It reads the POST form fields "url" (page to download) and "chapter"
// (selector of the element that wraps the chapter list), then collects one
// entry per <li> found under a <ul> inside the matched element. Each entry
// carries the text of the <a> inside the <li> and its href attribute.
//
// Responses:
//   - 200 {"body": [...]}            on success (empty list when nothing matches)
//   - 400 {"body": [], "error": ...} when "url" is missing
//   - 502 {"body": [], "error": ...} when the page cannot be downloaded or parsed
func indexStoryHandleFunc(c *gin.Context) {
	url, _ := c.GetPostForm("url")
	chapter, _ := c.GetPostForm("chapter")
	type Chapter struct {
		Name string `json:"name"`
		Href string `json:"href"`
	}
	list := make([]Chapter, 0)

	if url == "" {
		c.JSON(http.StatusBadRequest, gin.H{
			"body":  list,
			"error": "missing form field: url",
		})
		return
	}

	// A failed download yields a nil document; calling Find on it would panic,
	// so the error has to be handled before touching doc.
	doc, err := goquery.NewDocument(url)
	if err != nil {
		c.JSON(http.StatusBadGateway, gin.H{
			"body":  list,
			"error": err.Error(),
		})
		return
	}

	doc.Find(chapter).Find("ul").Find("li").Each(func(_ int, item *goquery.Selection) {
		link := item.Find("a")
		// A missing href is reported as an empty string, as before.
		href, _ := link.Attr("href")
		list = append(list, Chapter{
			Name: link.Text(),
			Href: href,
		})
	})

	// Send the collected chapters back to the caller.
	c.JSON(http.StatusOK, gin.H{
		"body": list,
	})
}

// pageStoryHandleFunc 当前页
// @Summary 当前页
// @Description 当前页
// @Tags 查找小说
// @Param url formData string true "网站首页"
// @Param start formData string true "当前页"
// @Param end formData string true "截至章"
// @Param body formData string true "内容标签"
// @Param dir formData string true "目录"
// @Param last formData string true "上一页"
// @Param next formData string true "下一页"
// @Router /story/page [POST]
// pageStoryHandleFunc returns the HTML of one chapter, following "next" links
// when the chapter is split across several pages.
//
// POST form fields: "url" (site prefix), "start" (path of the first page),
// "end" (path at which to stop, compared against each page's "next" link),
// "body" (selector of the content element) and "dir"/"last"/"next" (link
// texts handed to goSearch to locate the navigation links).
//
// The first page is url+start. After that, at most maxExtraPages further pages
// are fetched; fetching stops as soon as the "next" link equals "end" or no
// "next" link was found. Page contents are joined with "<br><br>".
//
// The response is 200 {"body", "dirUrl", "lastUrl", "nextUrl"}, where the
// three URLs are those reported for the last page that was fetched.
func pageStoryHandleFunc(c *gin.Context) {
	// Upper bound on the number of follow-up pages fetched for one request.
	const maxExtraPages = 9

	url, _ := c.GetPostForm("url")
	start, _ := c.GetPostForm("start")
	end, _ := c.GetPostForm("end")
	body, _ := c.GetPostForm("body")
	dir, _ := c.GetPostForm("dir")
	last, _ := c.GetPostForm("last")
	next, _ := c.GetPostForm("next")

	// Fetch the first page of the chapter.
	text, dirUrl, lastUrl, nextUrl := goSearch(url+start, body, dir, last, next)
	for i := 0; i < maxExtraPages; i++ {
		// Without a "next" link there is nothing left to follow; requesting
		// url+"" would only re-download the site's home page.
		if nextUrl == "" || nextUrl == end {
			break
		}
		var pageText string
		pageText, dirUrl, lastUrl, nextUrl = goSearch(url+nextUrl, body, dir, last, next)
		text = text + "<br><br>" + pageText
	}

	// Send the assembled chapter back to the caller.
	c.JSON(http.StatusOK, gin.H{
		"body":    text,
		"dirUrl":  dirUrl,
		"lastUrl": lastUrl,
		"nextUrl": nextUrl,
	})
}
// goSearch downloads the page at url and extracts the chapter content and the
// navigation links from it.
//
// text is the inner HTML of the element matched by the body selector (when
// several elements match, the last one wins). dirUrl, lastUrl and nextUrl are
// the href values of the <a> elements whose text is exactly equal to dir,
// last and next respectively; when several anchors carry the same text the
// last one wins. A value stays empty when nothing matches.
//
// If the page cannot be downloaded or parsed, the error is printed and all
// four results are returned empty.
func goSearch(url, body, dir, last, next string) (text, dirUrl, lastUrl, nextUrl string) {
	// A failed download yields a nil document; calling Find on it would panic.
	doc, err := goquery.NewDocument(url)
	if err != nil {
		fmt.Println("goquery.NewDocument()出错:", err)
		return
	}

	doc.Find(body).Each(func(_ int, content *goquery.Selection) {
		text, err = content.Html()
		if err != nil {
			fmt.Println("selection.Html()出错:", err)
			return
		}
	})

	doc.Find("a").Each(func(_ int, anchor *goquery.Selection) {
		label := anchor.Text()
		// An empty label would match any text-less anchor (e.g. image links)
		// whenever a rule leaves dir/last/next blank, so it is never matched.
		if label == "" {
			return
		}
		href, exists := anchor.Attr("href")
		if !exists {
			return
		}
		// Independent checks: the same text may be configured for more than
		// one of the three roles.
		if label == dir {
			dirUrl = href
		}
		if label == last {
			lastUrl = href
		}
		if label == next {
			nextUrl = href
		}
	})
	return
}

前端代码:

// Path of the online book-source rules file (searchRules.json) under dataPath.
const searchPath = path.join(
    dataPath,
    "searchRules.json"
);

/**
 * Open an online (web-scraped) novel in the reader.
 *
 * Steps, in order:
 *  1. Read the reading-history file, pass it through
 *     sortJsonArr(readData, "desc", "readTime") and write the result back.
 *  2. Find the rule in searchRules.json whose `id` equals `novelHistory.searchId`;
 *     log an error and stop if there is none.
 *  3. Clear the `#content` element and request the chapter list from the local
 *     backend (`/story/chapters`).
 *  4. For every chapter, add a placeholder section to `#content` and an entry to
 *     the directory panel (`#chapter-online`). Chapter text is fetched on demand
 *     through getOneChapter.
 *  5. Immediately load the chapter whose href matches `novelHistory.readUrl`.
 *  6. Rebuild the main menu with a "目录" item that shows the directory panel.
 *
 * @param {object} novelHistory Reading-history record of the book. Fields read
 *     here: `searchId`, `bookPath`, `readUrl`.
 */
function getOnlineNovel(novelHistory) {
    // Re-sort the reading history and persist it.
    const readData = JSON.parse(fs.readFileSync(readPath, 'utf8'));
    const sortData = sortJsonArr(readData, "desc", "readTime");
    fs.writeFileSync(readPath, JSON.stringify(sortData, null, '\t'));

    // Look up the source rule bound to this book.
    const searchRules = JSON.parse(fs.readFileSync(searchPath, 'utf8'));
    const source = searchRules.find((rule) => rule.id === novelHistory.searchId);
    if (!source) {
        // Without a rule there is no site URL or selector to work with.
        console.error('getOnlineNovel: no source rule with id', novelHistory.searchId);
        return;
    }

    const webview1 = document.getElementById('content');
    webview1.innerHTML = '';

    // Request for the chapter list.
    const options2 = {
        url: "http://localhost:8708/story/chapters",
        data: {
            "url": source.url + novelHistory.bookPath,
            "chapter": source.chapter,
        },
        method: 'post',
        headers: {
            'Content-Type': 'multipart/form-data'
        }
    };

    axios(options2).then((res2) => {
        const chapterList = res2.data.body || [];
        // `chapters` and `dirId` are not declared in this function; they are
        // assigned exactly as before (presumably shared state declared
        // elsewhere in the file - confirm).
        chapters = chapterList;
        // Online books do not take part in the custom directory rules.
        dirId = -1;
        const id = 'chapter' + dirId;

        // "end" marker sent to the backend for chapter j: the href of the
        // following chapter, or the book path for the final chapter.
        const endOf = (j) => (
            j === chapterList.length - 1 ? novelHistory.bookPath : chapterList[j + 1].href
        );

        // Chapter href to resume from; source.url is stripped so that it can
        // be compared with the hrefs of the chapter list.
        const resumeHref = (novelHistory.readUrl || '').replace(source.url, '');

        // Container of the directory panels.
        const dixDiv = document.getElementById('fix-dir');
        // Drop a panel left over from a previous call so the id stays unique.
        const stalePanel = document.getElementById('chapter-online');
        if (stalePanel) {
            stalePanel.remove();
        }

        // Directory panel (hidden until the menu item is used).
        const dirRule = document.createElement("div");
        dirRule.id = 'chapter-online';
        dirRule.className = 'dir-rule';
        dirRule.style.display = 'none';
        // Panel header.
        const dirTop = document.createElement("div");
        dirTop.innerText = '目录';
        dirTop.className = 'dir-head';
        dirRule.appendChild(dirTop);
        // List of chapter entries.
        const dirgroup = document.createElement("div");
        dirgroup.className = 'dir-group';

        for (let j = 0; j < chapterList.length; j++) {
            const chapter = chapterList[j];
            const maodian = id + '-' + j;
            const loadChapter = () => getOneChapter(chapter.name, chapter.href, source, endOf(j), maodian);

            // Placeholder section in the content area; it only holds the
            // chapter title until the text has been fetched.
            const section = document.createElement("div");
            section.className = id;
            section.id = maodian;
            const chapterName = document.createElement("p");
            chapterName.innerText = chapter.name;
            chapterName.style.cursor = "pointer";
            chapterName.onclick = loadChapter;
            section.appendChild(chapterName);
            webview1.appendChild(section);

            // Matching entry in the directory panel.
            const dir = document.createElement("p");
            dir.id = id + '+' + j;
            dir.innerText = chapter.name;
            dir.title = chapter.name;
            dir.onclick = function () {
                console.log(section.innerText)
                // A section that still shows only its title has not been
                // loaded yet, so fetch it first.
                if (section.innerText === chapter.name) {
                    loadChapter();
                }
                section.scrollIntoView();
            };
            dirgroup.appendChild(dir);

            // Load the chapter the reader stopped at. chapter.href equals the
            // stripped readUrl here, so the backend never receives a start
            // path that still contains source.url.
            if (resumeHref !== '' && resumeHref === chapter.href) {
                loadChapter();
            }
        }
        dirRule.appendChild(dirgroup);
        dixDiv.appendChild(dirRule);

        // Rebuild the main menu.
        mainMenu = new Menu();
        mainMenu.append(menuShelf);
        mainMenu.append(new MenuItem(
            {
                label: '目录',
                click: function () {
                    // Show the directory panel with the page's background colour.
                    dirRule.style.display = 'block';
                    if (document.body.style.backgroundColor === 'rgba(255, 255, 255, 0)') {
                        dirRule.style.backgroundColor = '#ffffff';
                    } else {
                        dirRule.style.backgroundColor = document.body.style.backgroundColor;
                    }

                    // Scroll the directory to the entry of the current chapter.
                    const chapterNodes = document.getElementsByClassName(id);
                    const currentIndex = getCurrentChapter(chapterNodes);
                    const dirMaodian = id + '+' + currentIndex;
                    console.log("dirMaodian"+dirMaodian+"  currentIndex"+currentIndex)
                    const currentEntry = document.getElementById(dirMaodian);
                    if (currentEntry) {
                        currentEntry.scrollIntoView();
                    }

                    // Hide the panel on the next click in the content area.
                    // `once` keeps listeners from piling up on every menu use.
                    webview1.addEventListener('click', function () {
                        dirRule.style.display = 'none';
                    }, { once: true });
                }
            }
        ));
        mainMenu.append(menuBg);
        mainMenu.append(menuColor);
        mainMenu.append(menuScroll);
    }).catch((err) => {
        console.error('getOnlineNovel: failed to load the chapter list:', err);
    });
}
/**
 * Fetch one chapter from the local backend (`/story/page`) and render it into
 * the element with the given id.
 *
 * @param {string} chapterName Chapter title, shown as the first paragraph.
 * @param {string} start Path of the chapter's first page (sent as `start`).
 * @param {object} source Source rule; fields read: url, body, dir, last, next.
 * @param {string} end Path at which the backend stops following "next" links
 *     (sent as `end`).
 * @param {string} id Id of the element that receives the chapter.
 * @returns {Promise<void>} Settles once the chapter has been rendered or the
 *     failure has been logged; it never rejects.
 */
function getOneChapter(chapterName,start,source,end,id) {
    const options = {
        url: "http://localhost:8708/story/page",
        data: {
            "url": source.url,
            "start": start,
            "end": end,
            "body": source.body,
            "dir": source.dir,
            "last": source.last,
            "next": source.next,
        },
        method: 'post',
        headers: {
            'Content-Type': 'multipart/form-data'
        }
    };

    return axios(options).then((res) => {
        const nowChapter = document.getElementById(id);
        if (!nowChapter) {
            // The target section is gone (e.g. another book was opened).
            return;
        }
        // The title comes from a scraped page, so it is inserted as text
        // rather than as markup.
        const title = document.createElement('p');
        title.textContent = chapterName;
        // NOTE(review): the chapter body is HTML scraped from a third-party
        // site and is inserted unsanitised - consider sanitising it.
        nowChapter.innerHTML = res.data.body || '';
        nowChapter.prepend(title);
        console.log(res.data)
    }).catch((err) => {
        console.error('getOneChapter: failed to load chapter', chapterName, err);
    });
}

实现效果:
实现效果
完整源码:https://gitee.com/pig-peggy/touch_fish/tree/master/

  • 4
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值