Preface
These are notes from my study of topic crawlers. This post uses XPath to parse the HTML of a web page and runs a keyword search under a single first-level link (a minimal XPath sketch follows the list below), scraping for every hit on the keyword its:
url
title
source
publish time
body text
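Before the full scripts, here is a minimal, self-contained sketch of the XPath idea. The HTML snippet and URLs are invented for illustration; the real scripts below fetch live pages from search.cctv.com and news.cctv.com.
from lxml import html

# Invented snippet mimicking the shape of a CCTV search-result list.
snippet = '''
<ul>
  <li><h3 class="tit"><span lanmu1="https://example.com/a1">Article one</span></h3></li>
  <li><h3 class="tit"><span lanmu1="https://example.com/a2">Article two</span></h3></li>
</ul>
'''
tree = html.fromstring(snippet)
# @lanmu1 selects an attribute value; //text() would select text nodes instead.
urls = tree.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
print(urls)  # ['https://example.com/a1', 'https://example.com/a2']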
The full code is as follows:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/19 9:45
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent
import tkinter as tk
from tkinter import filedialog
root = tk.Tk()
root.withdraw()
folder_path = filedialog.askdirectory()   # the folder chosen in the dialog
file_path = filedialog.askopenfilename()  # the file chosen in the dialog
print(file_path)
'''Pick the path of a saved file;
the selected file supplies the URLs/keywords.
'''
with open(file_path) as file_name:
    reader = csv.reader(file_name)
    result = list(reader)
url_path = list(result[0:3])
print(url_path)
# print(url_path[1][1])  # read a specific cell of the nested list
url_word = url_path[0][:]
del url_word[0]
# record the start time
startTime = time.time()
# create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\my_url\my_url.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(["URL"])  # pass a list so "URL" lands in one cell rather than one letter per column
# ------------------- main function -------------------------
def main():
    qtext = url_word[0]
    for i in range(1, 5):
        url = 'https://search.cctv.com/search.php?qtext={}&sort=relevance&type=web&vtime=&datepid=1&channel=&page={}'.format(qtext, i)
        # url = spider_html(url)
        try:
            headers = {
                "User-Agent": UserAgent().chrome  # random Chrome user agent
            }
            # force the encoding to avoid garbled HTML
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            text_html = response.text
            # the result markup is wrapped in HTML comments; strip the markers to expose it
            text_html = text_html.replace(r'<!--', '"').replace(r'-->', '"')
            text_html = html.fromstring(text_html)
            text_list = text_html.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
            writer.writerow(text_list)
            print(text_list)
            print(len(text_list))
        except:
            pass
if __name__ == '__main__':
    main()
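The script above collects the result URLs for a keyword into my_url.csv. One trick worth flagging: the CCTV search page ships its result list inside HTML comments, so the script rewrites <!-- and --> before parsing, turning the commented-out markup into real nodes that XPath can see. A minimal sketch of the trick (the snippet is invented for illustration):
from lxml import html

# Invented snippet: the payload hides inside an HTML comment, which lxml
# would expose only as a Comment node, invisible to the XPath below.
raw = '<div><!--<ul><li><h3 class="tit"><span lanmu1="https://example.com/x">hit</span></h3></li></ul>--></div>'
raw = raw.replace('<!--', '"').replace('-->', '"')  # strip the comment markers
tree = html.fromstring(raw)
print(tree.xpath('//h3[@class="tit"]/span/@lanmu1'))  # ['https://example.com/x']
The second script then reads my_url.csv back, visits every URL in it, and scrapes the title, source, publish time, and body text into another CSV: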
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/17 9:27
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent
# record the start time
startTime = time.time()
# create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\cctv_te_langpu_data\关于特定关键词的检索.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(("Title", "Source page", "Time", "URL", "Body text"))
# ----------------------- spider function that scrapes the data -----------------------
def spider_html_info(url):
    try:
        headers = {
            "User-Agent": UserAgent().chrome  # random Chrome user agent
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        text_html = response.text
        text_html = html.fromstring(text_html)
        print(text_html)
        # defaults so the final writerow never hits an undefined name if a step fails
        title, publish_time, source_text, article_text = "", "", "", ""
        # get the next-page link from another element first, to keep the program robust
        # next_url = "http://news.cctv.com" + text_html.xpath('/html/body/div[2]/div[1]/ul[1]/li[2]/a[2]/@href')[0]
        # print("next_url", next_url)
        # get the article title
        try:
            article_title = text_html.xpath('//*[@id="title_area"]/h1//text()')
            title = "".join(article_title)
            if title == " ":
                pass
                # title = "".join(text_html.xpath('//*[@id="page_body"]/div[1]/div[1]/div[1]//text()'))
            print("title = ", title)
        except:
            pass
        # get the publish time
        try:
            publish_time = text_html.xpath('//*[@id="title_area"]/div//text()')
            print("publish_time= ", publish_time)
        except:
            pass
        try:
            print("url = ", url)
        except:
            pass
        # get the source of this news item
        try:
            source_text = text_html.xpath('//*[@id="title_area"]/div/a/@href')
            source = source_text[3:]
        except:
            pass
        # scrape the body text
        try:
            text_list = text_html.xpath('//*[@id="content_area"]//text()')
            article_text = "".join(text_list)
            # print(text_list)
            # article_text = "".join(text_list).replace('\r\n', '').replace("\xa0", "").replace("\t", "").replace(source_text, "").replace(title, "")
            print("article_text= ", article_text)
        except:
            pass
        writer.writerow((title, source_text, publish_time, url, article_text))
    except:
        pass
    # if url == 'http://www.chinanpo.gov.cn/1944/123496/index.html':
    #     fp.close()
    #     # record the finishing time
    #     endTime = time.time()
    #     useTime = (endTime - startTime) / 60
    #     print("This run took %s minutes" % useTime)
    #     # exit the program normally
    #     sys.exit(0)
    # else:
    #     return next_url
# ------------------- main function -------------------------
def main():
    # url = 'https://news.cctv.com/2020/10/17/ARTIp0AnISoJeLZW79bkffYW201017.shtml'  # 125177, the first article
    file_path = 'E:/The_data_for_topiclcrlaw/my_url/my_url.csv'
    # url = spider_html_info(url)
    with open(file_path) as file_name:
        reader = csv.reader(file_name)
        result = list(reader)
    del result[0]  # drop the "URL" header row
    a = len(result)     # number of rows
    b = len(result[0])  # number of columns
    for i in range(a):
        for j in range(b):
            url = spider_html_info(result[i][j])
    # for url in my_url_list:
    #     url = spider_html_info(url)
    # while True:
    #     print("Crawling article %s:" % count, url)
    #     next_url = spider_html_info(url)
    #     url = next_url
    #     count = count + 1

if __name__ == '__main__':
    main()
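Both scripts record startTime but never use it; the commented-out block in spider_html_info shows the intended wrap-up. As a sketch only (fp and startTime are the module-level names from the script above, and calling this at the end of main() is my assumption, not the original code):
def finish_crawl():
    # Close the CSV and report elapsed minutes, mirroring the commented-out block.
    fp.close()
    useTime = (time.time() - startTime) / 60
    print("This run took %s minutes" % useTime)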