-
- 实验目的
- 熟练掌握爬虫采集互联网数据的全过程;
- 了解互联网网站的一些常用反爬虫机制。
- 实验工具
- 编程语言:Python;
- 开发环境:PyCharm(或Sublime Text、Eclipse + PyDev、Visual Studio等);
- 常用模块:scrapy、urllib、requests等。
- 实验题目
热点时事新闻文章采集:
- 仅下载当天最新、热点的时事新闻;
- 不同网站的新闻保存在不同文件夹中,并记录每篇新闻的来源、标题、发布时间、下载时间、url地址等信息;
- 爬虫初始种子:新浪(news.sina.com.cn)、搜狐(news.sohu.com)、凤凰(news.ifeng.com)、网易(news.163.com)、百度(news.baidu.com)。
- 实验步骤
1. 安装Python、PyCharm开发环境,并下载爬虫所需的开发模块;
2. 建立相关爬虫项目,从不同新闻网站下载热点新闻文章。(请附上详细代码、爬虫下载截图、运行效果等内容)
1.新浪
import datetime
import os
import re

import bs4
import requests

# Crawl the headline articles linked from the Sina news front page and save
# each article as a text file under the "新浪" directory, appending its
# publish time, source, title, download time and URL.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
INDEX_URL = 'https://news.sina.com.cn/'

response = requests.get(INDEX_URL, headers=HEADERS)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Collect article URLs from the headline container (class "ct_t_01").
# NOTE: the original pattern used [a-zA-z], which also matches the
# punctuation characters [\]^_` — fixed to [a-zA-Z].  A double quote is
# excluded so a match cannot run past the end of an href attribute.
url_pattern = re.compile(r'[a-zA-Z]+://[^\s"]*ml')
article_urls = []
for container in soup.findAll('div', {'class': 'ct_t_01'}):
    anchors = str(container.select('a'))  # the <a> tags holding the links
    article_urls.extend(url_pattern.findall(anchors))
print(article_urls)

# Create the output directory up front instead of failing on first write.
os.makedirs('新浪', exist_ok=True)

# Compile the per-article patterns once, outside the loop.  Sina body
# paragraphs begin with two ideographic spaces (\u3000\u3000).
paragraph_pattern = re.compile(r'<p.*?>\u3000\u3000(.*?)</.*?>')
tag_pattern = re.compile(r'<.*?>')

for article_url in article_urls:
    try:
        page = requests.get(article_url, headers=HEADERS)
        page.encoding = 'utf-8'
        page_soup = bs4.BeautifulSoup(page.text, 'html.parser')
        title = page_soup.find('h1', class_='main-title').get_text()
        pub_date = page_soup.find('span', class_='date').text
        source = page_soup.find('a', class_='source').text
    except (requests.RequestException, AttributeError) as exc:
        # Skip articles that fail to download or use a different layout,
        # instead of crashing the whole crawl.
        print('skip %s: %s' % (article_url, exc))
        continue

    # Strip characters that are illegal in file names before using the
    # title as the output file name.
    safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
    with open(r'新浪/%s.txt' % safe_title, 'w', encoding='UTF-8') as out:
        for block in page_soup.findAll('div', {'class': 'article'}):
            for paragraph in paragraph_pattern.findall(str(block.select('p'))):
                out.write(tag_pattern.sub('', paragraph) + '\n')
        download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.write("爬取时间:" + download_time + "\n")
        out.write("发布时间: " + pub_date + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
- 搜狐
import datetime
import os
import re

import bs4
import requests

# Crawl the hot-news articles linked from the Sohu news front page and save
# each article as a text file under the "搜狐" directory, appending its
# publish time, source, title, download time and URL.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
INDEX_URL = 'http://news.sohu.com/'

response = requests.get(INDEX_URL, headers=HEADERS)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# The hot-news list lives in the element with id "block4"; its anchors hold
# site-relative paths, so the site prefix is prepended to each one.
href_pattern = re.compile(r'href="(.*?)" ')
article_urls = []
for container in soup.findAll('div', {'id': 'block4'}):
    anchors = str(container.select('a'))
    article_urls.extend('http://news.sohu.com' + path
                        for path in href_pattern.findall(anchors))
print(article_urls)

# Create the output directory up front instead of failing on first write.
os.makedirs('搜狐', exist_ok=True)

# Compile per-article patterns once, outside the loop.
paragraph_pattern = re.compile(r'<p.*?>(.*?)</.*?>')
tag_pattern = re.compile(r'<.*?>')
chinese_only = re.compile(r'[\u4e00-\u9fa5]+')  # keep only CJK chars for the file name

for article_url in article_urls:
    print(article_url)
    try:
        page = requests.get(article_url, headers=HEADERS)
        page.encoding = 'utf-8'
        page_soup = bs4.BeautifulSoup(page.text, 'html.parser')
        raw_title = page_soup.find('h1').text
        title = ''.join(chinese_only.findall(raw_title.strip()))
        pub_date = page_soup.find('span', class_='time').text
        source = page_soup.find('span', {'data-role': 'original-link'}).text.strip()
    except (requests.RequestException, AttributeError) as exc:
        # Skip articles that fail to download or use a different layout,
        # instead of crashing the whole crawl.
        print('skip %s: %s' % (article_url, exc))
        continue
    print(title)

    with open(r'搜狐/%s.txt' % title, 'w', encoding='UTF-8') as out:
        for block in page_soup.findAll('article', {'class': 'article'}):
            for paragraph in paragraph_pattern.findall(str(block.select('p'))):
                out.write(tag_pattern.sub('', paragraph) + '\n')
        download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.write("爬取时间:" + download_time + "\n")
        out.write("发布时间: " + pub_date + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
- 凤凰
import datetime
import os
import re

import bs4
import requests

# Crawl the headline articles linked from the ifeng (Phoenix) news front
# page and save each article as a text file under the "凤凰" directory,
# appending its publish time, source, title, download time and URL.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
INDEX_URL = 'https://news.ifeng.com/'

response = requests.get(INDEX_URL, headers=HEADERS)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Headline links on ifeng are protocol-relative (href="//..."), so the
# "http://" scheme is prepended to each extracted address.
href_pattern = re.compile(r'href="//(.*?)" ')
article_urls = []
for container in soup.findAll('div', {'class': 'index_content_RQk8t'}):
    anchors = str(container.select('a'))
    article_urls.extend('http://' + path for path in href_pattern.findall(anchors))
print(article_urls)

# Create the output directory up front instead of failing on first write.
os.makedirs('凤凰', exist_ok=True)

# Compile per-article patterns once, outside the loop.
paragraph_pattern = re.compile(r'<p.*?>(.*?)</.*?>')
tag_pattern = re.compile(r'<.*?>')

for article_url in article_urls:
    print(article_url)
    try:
        page = requests.get(article_url, headers=HEADERS)
        page.encoding = 'utf-8'
        page_soup = bs4.BeautifulSoup(page.text, 'html.parser')
        title = page_soup.find('h1', class_='index_topic_5hyUE').get_text()
        pub_date = page_soup.find('div', class_='index_timeBref_20hzr').text
        source = page_soup.find('div', class_='index_sourceTitleText_wlTy-').text
    except (requests.RequestException, AttributeError) as exc:
        # Skip articles that fail to download or use a different layout,
        # instead of crashing the whole crawl.
        print('skip %s: %s' % (article_url, exc))
        continue
    print(title)
    print(pub_date)
    print(source)

    # Strip characters that are illegal in file names before using the
    # title as the output file name.
    safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
    with open(r'凤凰/%s.txt' % safe_title, 'w', encoding='UTF-8') as out:
        for block in page_soup.findAll('div', {'class': 'index_main_content_j-HoG'}):
            for paragraph in paragraph_pattern.findall(str(block.select('p'))):
                out.write(tag_pattern.sub('', paragraph) + '\n')
        download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.write("爬取时间:" + download_time + "\n")
        out.write("发布时间: " + pub_date + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
- 网易
import datetime
import os
import re

import bs4
import requests

# Crawl the headline articles linked from the NetEase (163) news front page
# and save each article as a text file under the "网易" directory, appending
# its publish time, source, title, download time and URL.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
INDEX_URL = 'https://news.163.com/'

response = requests.get(INDEX_URL, headers=HEADERS)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Collect article URLs from the headline container.
# NOTE: the original pattern used [a-zA-z], which also matches the
# punctuation characters [\]^_` — fixed to [a-zA-Z].  A double quote is
# excluded so a match cannot run past the end of an href attribute.
url_pattern = re.compile(r'[a-zA-Z]+://[^\s"]*ml')
article_urls = []
for container in soup.findAll('div', {'class': 'news_default_news'}):
    anchors = str(container.select('a'))
    article_urls.extend(url_pattern.findall(anchors))

# Create the output directory up front instead of failing on first write.
os.makedirs('网易', exist_ok=True)

# Compile per-article patterns once, outside the loop.
paragraph_pattern = re.compile(r'<p.*?>(.*?)</.*?>')
tag_pattern = re.compile(r'<.*?>')
chinese_only = re.compile(r'[\u4e00-\u9fa5]+')  # keep only CJK chars for the file name

for article_url in article_urls:
    print(article_url)
    # The original code used a bare "except:" that silently swallowed every
    # error; catch only download and parse failures, and report them.
    try:
        page = requests.get(article_url, headers=HEADERS)
        page.encoding = 'utf-8'
        page_soup = bs4.BeautifulSoup(page.text, 'html.parser')
        raw_title = page_soup.find('h1', class_='post_title').get_text()
        title = ''.join(chinese_only.findall(raw_title.strip()))
        info = page_soup.find('div', class_='post_info')
        pub_date = info.get_text(strip=True).split(' ')[0]  # text begins with the timestamp
        source = info.find('a').text
    except (requests.RequestException, AttributeError) as exc:
        print('skip %s: %s' % (article_url, exc))
        continue
    print(title)
    print(pub_date)
    print(source)

    with open(r'网易/%s.txt' % title, 'w', encoding='UTF-8') as out:
        for block in page_soup.findAll('div', {'class': 'post_body'}):
            for paragraph in paragraph_pattern.findall(str(block.select('p'))):
                out.write(tag_pattern.sub('', paragraph) + '\n')
        download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.write("爬取时间:" + download_time + "\n")
        out.write("发布时间: " + pub_date + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
- 百度
import datetime
import os
import re

import bs4
import requests

# Crawl the hot-news articles linked from the Baidu news front page and save
# each article as a text file under the "百度" directory, appending its
# publish time, source, title, download time and URL.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
}
INDEX_URL = 'https://news.baidu.com/'

response = requests.get(INDEX_URL, headers=HEADERS)
print(response.status_code)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Hot news lives in the active tab pane; its anchors carry absolute URLs.
href_pattern = re.compile(r'href="(.*?)" ')
article_urls = []
for container in soup.findAll('div', {'class': 'mod-tab-pane active'}):
    anchors = str(container.select('a'))
    article_urls.extend(href_pattern.findall(anchors))
print(article_urls)

# Create the output directory up front instead of failing on first write.
os.makedirs('百度', exist_ok=True)

# Compile per-article patterns once, outside the loop.
paragraph_pattern = re.compile(r'<p.*?>(.*?)</.*?>')
tag_pattern = re.compile(r'<.*?>')
chinese_only = re.compile(r'[\u4e00-\u9fa5]+')  # keep only CJK chars for the file name

# The first three extracted links are skipped (the original code started at
# index 3) — presumably tab/navigation links rather than articles; confirm
# against the live page layout.
for article_url in article_urls[3:]:
    print(article_url)
    try:
        page = requests.get(article_url, headers=HEADERS)
        page.encoding = 'utf-8'
        page_soup = bs4.BeautifulSoup(page.text, 'html.parser')
        raw_title = page_soup.find('h1', class_='title').get_text()
        title = ''.join(chinese_only.findall(raw_title))
        pub_date = page_soup.find('div', class_='media-publish-time').get_text()
        source = page_soup.find('div', class_='source').text
    except (requests.RequestException, AttributeError) as exc:
        # Skip articles that fail to download or use a different layout,
        # instead of crashing the whole crawl.
        print('skip %s: %s' % (article_url, exc))
        continue
    print(title)
    print(pub_date)
    print(source)

    with open(r'百度/%s.txt' % title, 'w', encoding='UTF-8') as out:
        for block in page_soup.findAll('div', {'id': 'detail'}):
            for paragraph in paragraph_pattern.findall(str(block.select('p'))):
                out.write(tag_pattern.sub('', paragraph) + '\n')
        download_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        out.write("爬取时间:" + download_time + "\n")
        out.write("发布时间: " + pub_date + " 来源: " + source + " 标题: " + title + " 网址: " + article_url + "\n")
-
- 实验心得
简要介绍你在实验中使用到的各爬虫模块主要功能、爬虫下载的主要步骤、常见的反爬虫机制等?