JS爬取新闻内容——初学者的历程(五)

在完成了搜索功能后,要开始尝试上次助教所说的一个网页同时具备爬虫和搜索的功能。对于这一点我作了如下的构想:
我的网页主页是一个搜索页面也就是前面已经做好的页面,然后在此基础上用CSS对页面进行美化同时加上一栏导航栏。导航栏中由主页、新闻爬取、关于这三项组成。
其中新闻爬取会打开一个新的网页其中会有三个按钮来触发爬虫(分别对应三个网站)。然后关于中会打开一个新的网页其中以文字形式附相关代码及说明。
那么首先就是要把之前构建的服务器响应的请求扩大。当然这其实很简单,只不过再多写几个类似的函数然后把爬虫代码放在其中就好。最后得到以下超长代码(个人能力有限觉得没法把三个网站的爬虫合并只能一一写,所以合起来特别长):

var mysql  = require('mysql');  
 
var connection = mysql.createConnection({     
  host     : 'localhost',       
  user     : 'root',              
  password : 'root',       
  port: '3306',                   
  database: 'crawl' 
}); 
connection.connect();
var express = require('express');
var app = express();
app.use('/public', express.static('public'));
app.get('/index', function(req, res) {
    res.sendFile(__dirname + "/public/" + "4.20.html");
})
app.get('/process_get', function(req, res) {
    // 输出 JSON 格式
     keyword=req.query.keyword;
     var sql="select url,title,keywords,publish_date,author from fetches where keywords like '%"+keyword+"%'";
     connection.query(sql,function (err, result) {
        if(err){
          console.log('[SELECT ERROR] - ',err.message);
          return;
        }
        res.setHeader('Content-Type', 'text/html; charset=utf-8');
               console.log(result);
               res.end(JSON.stringify(result,null,'<br>'));

})})

app.use('/public', express.static('public'));
app.get('/index', function(req, res) {
    res.sendFile(__dirname + "/public/" + "4.21.html");
})
app.get('/process_get2', function(req, res) {
    // 输出 JSON 格式
    var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');
var mysql = require('./mysql.js');

var source_name = "中国新闻网";
var domain = 'http://www.chinanews.com/';
var myEncoding = "utf-8";
var seedURL = 'http://www.chinanews.com/';

var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7}).shtml/;


var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/

//防止网站屏蔽我们的爬虫
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

//request模块异步fetch url
function request(url, callback) {
    var options = {
        url: url,
        encoding: null,
        //proxy: 'http://x.x.x.x:8080',
        headers: headers,
        timeout: 10000 //
    }
    myRequest(options, callback)
};

seedget();

function seedget() {
    request(seedURL, function(err, res, body) { //读取种子页面
        // try {
        //用iconv转换编码
        var html = myIconv.decode(body, myEncoding);
        //console.log(html);
        //准备用cheerio解析html
        var $ = myCheerio.load(html, { decodeEntities: true });
        // } catch (e) { console.log('读种子页面并转码出错:' + e) };
        var seedurl_news;
        try {
            seedurl_news = eval(seedURL_format);
        } catch (e) { console.log('url列表所处的html块识别出错:' + e) };
        seedurl_news.each(function(i, e) { //遍历种子页面里所有的a链接
            var myURL = "";
            try {
                //得到具体新闻url
                var href = "";
                href = $(e).attr("href");
                if (href == undefined) return;
                if (href.toLowerCase().indexOf('http://') >= 0) myURL = href; //http://开头的
                else if (href.startsWith('//')) myURL = 'http:' + href; 开头的
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; //其他

            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }

            if (!url_reg.test(myURL)) return; //检验是否符合新闻url的正则表达式
            //console.log(myURL);

            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (vals.length > 0) {
                    console.log('URL duplicate!')
                } else newsGet(myURL); //读取新闻页面
            });
        });
    });
};

function newsGet(myURL) { //读取新闻页面
    request(myURL, function(err, res, body) { //读取新闻页面
        //try {
        var html_news = myIconv.decode(body, myEncoding); //用iconv转换编码
        //console.log(html_news);
        //准备用cheerio解析html_news
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        myhtml = html_news;
        //} catch (e) {    console.log('读新闻页面并转码出错:' + e);};

        console.log("转码读取成功:" + myURL);
        //动态执行format字符串,构建json对象准备写入文件或数据库
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
        //fetch.html = myhtml;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; //编码
        fetch.crawltime = new Date();

        if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format);  //没有关键词就用sourcename
        else fetch.keywords = eval(keywords_format);

        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format); //标题

        if (date_format != "") fetch.publish_date = eval(date_format); //刊登日期   
        console.log('date: ' + fetch.publish_date);
        fetch.publish_date = regExp.exec(fetch.publish_date)[0];
        fetch.publish_date = fetch.publish_date.replace('年', '-')
        fetch.publish_date = fetch.publish_date.replace('月', '-')
        fetch.publish_date = fetch.publish_date.replace('日', '')
        fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

        if (author_format == "") fetch.author = source_name; //eval(author_format);  //作者
        else fetch.author = eval(author_format);

        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); //内容,是否要去掉作者信息自行决定

        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", ""); //来源

        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format).replace("\r\n", ""); //摘要    

        // var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
        //     "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
        // 存储json
        // fs.writeFileSync(filename, JSON.stringify(fetch));

        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];

        //执行sql,数据库中fetch表里的url属性是unique的,不会把重复的url内容写入数据库
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        }); //mysql写入
    });
}
res.setHeader('Content-Type', 'text/html; charset=utf-8');
res.end('爬取代码成功')
})
app.use('/public', express.static('public'));
app.get('/index', function(req, res) {
    res.sendFile(__dirname + "/public/" + "4.21.html");
})
app.get('/process_get3', function(req, res) {
    // 输出 JSON 格式
     var mysql = require('./mysql.js');
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');


var source_name = "网易新闻";
var domain = 'https://news.163.com/';
var myEncoding = "GBK";
var seedURL = 'https://news.163.com/';

var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = " $('meta[property=\"og:title\"]').eq(0).attr(\"content\")";
var date_format = "$('.post_time_source').text()"|"$(#ptime).text()";
var author_format =" $('meta[name=\"author\"]').eq(0).attr(\"content\")";
var content_format = "$('#endText').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format ="$('#ne_article_source').text()";
var url_reg =/\/(\d{2})\/(\d{4})\/(\d{2})\/(\w{10,30}).html/;

var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)|(\d{4}\-\d{2}\-\d{2})/

//防止网站屏蔽我们的爬虫
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

//request模块异步fetch url
function request(url, callback) {
    var options = {
        url: url,
        encoding: null,
        //proxy: 'http://x.x.x.x:8080',
        headers: headers,
        timeout: 10000 //
    }
    myRequest(options, callback)
};

seedget();

function seedget() {
    request(seedURL, function(err, res, body) { //读取种子页面
        // try {
        //用iconv转换编码
        var html = myIconv.decode(body, myEncoding);
        //console.log(html);
        //准备用cheerio解析html
        var $ = myCheerio.load(html, { decodeEntities: true });
        // } catch (e) { console.log('读种子页面并转码出错:' + e) };
        var seedurl_news;
        try {
            seedurl_news = eval(seedURL_format);
        } catch (e) { console.log('url列表所处的html块识别出错:' + e) };
        seedurl_news.each(function(i, e) { //遍历种子页面里所有的a链接
            var myURL = "";
            try {
                //得到具体新闻url
                var href = "";
                href = $(e).attr("href");
                if (href == undefined) return;
                if (href.toLowerCase().indexOf('https://') >= 0) myURL = href //http://开头的
                else if (href.toLowerCase().indexOf('http://') >= 0) myURL = href
                else if (href.startsWith('//')) myURL = 'https:' + href 开头的
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href //其他

            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }

            if (!url_reg.test(myURL)){ 
            return;} //检验是否符合新闻url的正则表达式
            //console.log(myURL);

            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (vals.length > 0) {
                    console.log('URL duplicate!')
                } else newsGet(myURL); //读取新闻页面
            });
        });
    });
};

function newsGet(myURL) { //读取新闻页面
    request(myURL, function(err, res, body) { //读取新闻页面
        //try {
        var html_news = myIconv.decode(body, myEncoding); //用iconv转换编码
        //console.log(html_news);
        //准备用cheerio解析html_news
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        myhtml = html_news;
        //} catch (e) {    console.log('读新闻页面并转码出错:' + e);};

        console.log("转码读取成功:" + myURL);
        //动态执行format字符串,构建json对象准备写入文件或数据库
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
        //fetch.html = myhtml;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; //编码
        fetch.crawltime = new Date();

        if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format);  //没有关键词就用sourcename
        else fetch.keywords = eval(keywords_format);

        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format); //标题

        if (date_format != "") fetch.publish_date = eval(date_format); //刊登日期   
        console.log('date: ' + fetch.publish_date);
        fetch.publish_date = regExp.exec(fetch.publish_date)[0];
        fetch.publish_date = fetch.publish_date.replace('年', '-')
        fetch.publish_date = fetch.publish_date.replace('月', '-')
        fetch.publish_date = fetch.publish_date.replace('日', '')
        fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

        if (author_format == "") fetch.author = source_name; //eval(author_format);  //作者
        else fetch.author = eval(author_format);

        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); //内容,是否要去掉作者信息自行决定

        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", ""); //来源

        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format).replace("\r\n", ""); //摘要    

        // var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
        //     "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
        // 存储json
        // fs.writeFileSync(filename, JSON.stringify(fetch));

        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];

        //执行sql,数据库中fetch表里的url属性是unique的,不会把重复的url内容写入数据库
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        }); //mysql写入
    });
}
res.setHeader('Content-Type', 'text/html; charset=utf-8');
res.end('爬取代码成功')
})
app.use('/public', express.static('public'));
app.get('/index', function(req, res) {
    res.sendFile(__dirname + "/public/" + "4.21.html");
})
app.get('/process_get4', function(req, res) {
    // 输出 JSON 格式
    var mysql = require('./mysql.js');
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');


var source_name = "网易新闻";
var domain = 'http://www.people.com.cn/';
var myEncoding = "GBK";
var seedURL = 'http://www.people.com.cn/';

var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = " $('meta[property=\"og:title\"]').eq(0).attr(\"content\")";
var date_format = " $('meta[name=\"publishdate\"]').eq(0).attr(\"content\")";
var author_format =" $('meta[name=\"author\"]').eq(0).attr(\"content\")";
var content_format = "$('.box_con').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format =" $('meta[name=\"source\"]').eq(0).attr(\"content\")";
var url_reg =/\/(\w{2})\/(\d{4})\/(\d{4})\/(\w{5,10}\-\w{8,15}).html/;

var regExp = /(\d{4}\-\d{2}\-\d{2})/

//防止网站屏蔽我们的爬虫
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}

//request模块异步fetch url
function request(url, callback) {
    var options = {
        url: url,
        encoding: null,
        //proxy: 'http://x.x.x.x:8080',
        headers: headers,
        timeout: 10000 //
    }
    myRequest(options, callback)
};

seedget();

function seedget() {
    request(seedURL, function(err, res, body) { //读取种子页面
        // try {
        //用iconv转换编码
        var html = myIconv.decode(body, myEncoding);
        //console.log(html);
        //准备用cheerio解析html
        var $ = myCheerio.load(html, { decodeEntities: true });
        // } catch (e) { console.log('读种子页面并转码出错:' + e) };
        var seedurl_news;
        try {
            seedurl_news = eval(seedURL_format);
        } catch (e) { console.log('url列表所处的html块识别出错:' + e) };
        seedurl_news.each(function(i, e) { //遍历种子页面里所有的a链接
            var myURL = "";
            try {
                //得到具体新闻url
                var href = "";
                href = $(e).attr("href");
                if (href == undefined) return;
                if (href.toLowerCase().indexOf('https://') >= 0) myURL = href //http://开头的
                else if (href.toLowerCase().indexOf('http://') >= 0) myURL = href
                else if (href.startsWith('//')) myURL = 'https:' + href 开头的
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href //其他

            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }

            if (!url_reg.test(myURL)){ 
            return;} //检验是否符合新闻url的正则表达式
            //console.log(myURL);

            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (vals.length > 0) {
                    console.log('URL duplicate!')
                } else newsGet(myURL); //读取新闻页面
            });
        });
    });
};

function newsGet(myURL) { //读取新闻页面
    request(myURL, function(err, res, body) { //读取新闻页面
        //try {
        var html_news = myIconv.decode(body, myEncoding); //用iconv转换编码
        //console.log(html_news);
        //准备用cheerio解析html_news
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        myhtml = html_news;
        //} catch (e) {    console.log('读新闻页面并转码出错:' + e);};

        console.log("转码读取成功:" + myURL);
        //动态执行format字符串,构建json对象准备写入文件或数据库
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
        //fetch.html = myhtml;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; //编码
        fetch.crawltime = new Date();

        if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format);  //没有关键词就用sourcename
        else fetch.keywords = eval(keywords_format);

        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format); //标题

        if (date_format != "") fetch.publish_date = eval(date_format); //刊登日期   
        console.log('date: ' + fetch.publish_date);
        fetch.publish_date = regExp.exec(fetch.publish_date)[0];
        fetch.publish_date = fetch.publish_date.replace('年', '-')
        fetch.publish_date = fetch.publish_date.replace('月', '-')
        fetch.publish_date = fetch.publish_date.replace('日', '')
        fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

        if (author_format == "") fetch.author = source_name; //eval(author_format);  //作者
        else fetch.author = eval(author_format);

        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); //内容,是否要去掉作者信息自行决定

        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format).replace("\r\n", ""); //来源

        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format).replace("\r\n", ""); //摘要    

        // var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
        //     "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
        // 存储json
        // fs.writeFileSync(filename, JSON.stringify(fetch));

        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];

        //执行sql,数据库中fetch表里的url属性是unique的,不会把重复的url内容写入数据库
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        }); //mysql写入
    });
}res.setHeader('Content-Type', 'text/html; charset=utf-8');
res.end('爬取代码成功')
     })
var server = app.listen(8081, function() {
    var host = server.address().address
    var port = server.address().port
    console.log("访问地址为 http://%s:%s", host, port)
})

在把服务器更新完毕后,我们先对4.21.html也就是新闻爬虫所打开的新网页进行大概的测试,先把他简单的搞成之前的4.20.html内容复制三遍,也就是:
在这里插入图片描述
三个按钮分别对应三个请求。
然后我们用命令行打开服务器:
在这里插入图片描述
在这里插入图片描述
在对html中三个按钮一一测试。这里用第一个为例(对应中国新闻网):
点击后在服务器端我们可以看到其实就是相当于运行了我们之前的爬虫代码:
在这里插入图片描述
然后在服务器代码中我们res.end了一个”爬取代码成功“所以在浏览器上会出现:
在这里插入图片描述
这样一来我们测试就完成了。然后就是对4.20.html,4.21.html进行页面上的优化:
首先是4.20.html也就是主页面搜索页面采用了一些CSS样式并且还放上了一张个人觉得挺符合这个网页气质的背景图片后得到如下代码:

<!DOCTYPE html>
<html>
<title>新闻内容爬取关键词查询</title>
<meta charset="utf-8">
<style type="text/css">
     body{background-image: url(./back.jpg);background-repeat: fixed;background-repeat: no-repeat;background-size: 150%;}
     input.keyword{
     	position: relative;
		top: 60px;
		left: 450px;
		z-index: -1;
		padding: 14px 250px;
     }
     input.button{
     	position: relative;
     	top:150px;
     	left: 650px;
     	z-index: -1;
     	padding: 10px 100px;
     }
	 ul{
			list-style-type: none;
			margin:0;
			padding: 0;
			overflow: hidden;
			background-color: grey;
		}
		li{float: left;}
		li a{
			display: block;
			color:blue;
			text-align: center;
			padding: 14px 232px;
			text-decoration: none;
		}
		li a:hover{
			background-color: #555;
			color: red;
		}
	section{

		position: relative;
		top: 60px;
		left: 500px;
		z-index: -1;
	}
	section a{
		color: white;
		font-size: 20px;
		font-weight: bold;
	}
	section a:hover{
         color: red;
	}

		</style>
<body>
	<ul>
		<li>
			<a href="4.20.html">主页</a>
		</li>
		<li><a href="4.21.html">新闻网站</a></li>
		<li><a href="关于.html">关于</a></li>
	</ul>
	<br>
    <form action="http://127.0.0.1:8081/process_get" method="GET">
        <br> <input type="text" name="keyword" class="keyword">
        <br>
        <input type="submit" value="搜索" class="button">
    </form>
</body>

</html>

然后得到优化后的网页页面如下:
在这里插入图片描述
然后再对4.21.html也就是新闻爬虫界面进行编写,这里其实就是简单的三个按钮而已:

<!DOCTYPE html>
<html>
<title>新闻内容爬取关键词查询</title>
<meta charset="utf-8">
<style type="text/css">
     body{background-image: url(./back.jpg);background-repeat: fixed;background-repeat: no-repeat;background-size: 150%;}
input{
	   position: relative;
       left: 600px;
       padding: 20px 100px;
     }

		</style>
<body>
<br><br><br>
    <form action="http://127.0.0.1:8081/process_get2" method="GET">
       
        <input type="submit" value="爬取中国新闻网">
    </form>
    <br>
    <br>
        <form action="http://127.0.0.1:8081/process_get3" method="GET">
        <br> 
        <br>
        <input type="submit" value="爬取网易新闻  ">
    </form>
    <br>
    <br>
        <form action="http://127.0.0.1:8081/process_get4" method="GET">
        <br> 
        <br>
        <input type="submit" value="爬取人民网    ">
    </form>


</body>

然后呈现页面如下:
在这里插入图片描述
分别点击按钮后的效果在上面测试中已经呈现。
然后关于.html中给出爬虫代码,呈现如下页面:
在这里插入图片描述
综上这个项目我个人觉得就算完成的差不多了吧。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
提供的源码资源涵盖了安卓应用、小程序、Python应用和Java应用等多个领域,每个领域都包含了丰富的实例和项目。这些源码都是基于各自平台的最新技术和标准编写,确保了在对应环境下能够无缝运行。同时,源码中配备了详细的注释和文档,帮助用户快速理解代码结构和实现逻辑。 适用人群: 这些源码资源特别适合大学生群体。无论你是计算机相关专业的学生,还是对其他领域编程感兴趣的学生,这些资源都能为你提供宝贵的学习和实践机会。通过学习和运行这些源码,你可以掌握各平台开发的基础知识,提升编程能力和项目实战经验。 使用场景及目标: 在学习阶段,你可以利用这些源码资源进行课程实践、课外项目或毕业设计。通过分析和运行源码,你将深入了解各平台开发的技术细节和最佳实践,逐步培养起自己的项目开发和问题解决能力。此外,在求职或创业过程中,具备跨平台开发能力的大学生将更具竞争力。 其他说明: 为了确保源码资源的可运行性和易用性,特别注意了以下几点:首先,每份源码都提供了详细的运行环境和依赖说明,确保用户能够轻松搭建起开发环境;其次,源码中的注释和文档都非常完善,方便用户快速上手和理解代码;最后,我会定期更新这些源码资源,以适应各平台技术的最新发展和市场需求。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值