After spending the past few days crawling some news corpora, here are a few lessons learned:
1. You can look for patterns in the page URLs by hand and write the list of URLs to crawl into a txt file.
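For example, a minimal sketch (the file name urls.txt is my own assumption) could build the URLs from the observed pagination pattern and save them for later use:

    # Hypothetical example: build page URLs from an observed pattern and save them to a txt file.
    urls = ['http://www.takungpao.com/news/232108/' + str(i) + '.html' for i in range(2, 11)]
    with open('urls.txt', 'w', encoding='utf-8') as f:   # 'urls.txt' is an assumed file name
        f.write('\n'.join(urls))

    # Later, read the URLs back before crawling.
    with open('urls.txt', 'r', encoding='utf-8') as f:
        url_list = [line.strip() for line in f if line.strip()]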
2. Set request headers to mimic a browser, so the site does not detect the crawler and cut the connection:
self.header = {
    # 'cookie': '__gads=ID=510d384eea0592e5:T=1561861082:S=ALNI_Ma9KgyBvcWfhnUp8XkVkIB6TJRzog; _ga=GA1.2.1810481865.1561861082; vruid=e774-e455-6bbe-1561867251; has_js=1; Hm_lvt_94444c9ea980451456647b253e40ad9f=1564705106,1565263545,1565447158,1565675611; _gid=GA1.2.4428587.1565675613; sessionStatusZB=1; pa-submit=.3114-1565675835825; Hm_lpvt_94444c9ea980451456647b253e40ad9f=1565675883',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}
3. Remember to close the response and pause briefly, which makes the disguise more convincing:
res.close()      # close the response explicitly
time.sleep(1)    # requires `import time`; wait a moment between requests
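Putting tips 2 and 3 together, a minimal fetch helper might look like the sketch below; the function name fetch_html and the random 1 to 3 second delay are my own choices rather than part of the original code:

    import random
    import time

    import requests

    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }

    def fetch_html(url):
        """Hypothetical helper: fetch a page with a browser-like header, close the response, then pause."""
        res = requests.get(url, headers=HEADER, timeout=10)
        res.encoding = 'utf-8'
        html = res.text
        res.close()                        # close the response explicitly
        time.sleep(random.uniform(1, 3))   # random delay to look less like a bot
        return html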
4. Crawling dynamically loaded pages:
First press F12 to open the developer tools, switch to the Network tab and filter by JS/XHR. Scroll the page by hand and watch for newly generated requests; open the new request and find
Request URL: https://www.cyberctm.com/zh_TW/news/lazyload?type=%E5%9C%8B%E9%9A%9B%E6%96%B0%E8%81%9E&page=2&limit=10&offset=10
This is the dynamic link. Open it and you will see the response is actually JSON, which can be loaded with the code below:
from urllib import request
import json

with request.urlopen(url, timeout=10) as response:   # url = the Request URL found above
    content = response.read()
print(content)
data_list = json.loads(content)
print(type(data_list))
print(len(data_list))
for i in data_list:
    text_1 = i['content_t']
    text_2 = text_1.replace('<P>', '')
    text_3 = text_2.replace('</P>', '')
    text_4 = text_3.replace(' ', '')   # note: chain from text_3, not text_2
    print(text_4)
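The page, limit and offset parameters in the Request URL suggest the endpoint can be paged. The sketch below assumes (my assumption, not verified in this post) that stepping page and offset together returns the next batch of articles:

    from urllib import request, parse
    import json
    import time

    base = 'https://www.cyberctm.com/zh_TW/news/lazyload'
    news_type = parse.quote('國際新聞')          # the type parameter seen in the Request URL
    limit = 10

    for page in range(1, 4):                     # assumed paging scheme: page and offset advance together
        url = f'{base}?type={news_type}&page={page}&limit={limit}&offset={(page - 1) * limit}'
        with request.urlopen(url, timeout=10) as response:
            batch = json.loads(response.read())
        for item in batch:
            print(item['content_t'][:50])        # preview the first 50 characters of each article
        time.sleep(1)                            # be polite between requests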
5. Full example code:
from bs4 import BeautifulSoup
import re, json
import requests, os
from datetime import datetime
import numpy as np
import random
class downloader(object):
    def __init__(self, url_target):
        self.target = url_target  # target URL
        self.header = {
            # 'cookie': '__cfduid=d9f7d437c55121035c533fec00ae560551565261934; cf_clearance=1bf1420cd0d784aed9a3a990968d1755787e5cd7-1565261948-604800-250; _ga=GA1.3.1012363973.1565261950; EU_COOKIE_LAW_CONSENT=true; _ym_uid=1565262940639941638; _ym_d=1565262940; ivx_990498=55; ivx_990397=265; ivx_990372=180; ivx_990158=60; _gid=GA1.3.432812653.1565671800; _ym_isad=1; _ym_wasSynced=%7B%22time%22%3A1565675010184%2C%22params%22%3A%7B%22eu%22%3A0%7D%2C%22bkParams%22%3A%7B%7D%7D; _dc_gtm_UA-33948949-1=1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }

    def get_fanye_url(self, target):  # collect pagination links
        req = requests.get(url=target, headers=self.header)  # fetch the page
        req.encoding = "utf-8"  # set the encoding
        html = req.text  # page source
        bf = BeautifulSoup(html, 'lxml')  # parse with BeautifulSoup
        fanye = []
        fanye_code = bf.find_all('a', href=re.compile('column/node_29'))
        for i in fanye_code:
            print(i)
        if len(fanye_code) == 0:
            fanye.append(target)
        else:
            for x in fanye_code:  # the matching <a> tags
                link = x.get('href')  # extract the link
                if link:
                    print('fanye_url', link)
                    fanye.append(link)  # store in the list
            fanye.pop()  # drop the last pagination link
        return fanye
    def get_news_url(self, fanye):  # collect news links from one pagination page
        req = requests.get(fanye, headers=self.header)  # fetch the page
        req.encoding = "utf-8"  # set the encoding
        html = req.text  # page source
        bf = BeautifulSoup(html, 'lxml')  # parse with BeautifulSoup
        news_linklist = []  # list of news links
        news_url = bf.find_all('a', href=re.compile('http://www.takungpao.com/news/232108/'))
        for x in news_url:  # the matching <a> tags
            link = x.get('href')  # extract the link
            if link:
                # print('news_url:', link)
                news_linklist.append(link)  # store in the list
        return news_linklist
    def getnewsdetail(self, newsurl):  # get the content of a single news page
        result = {}
        res = requests.get(url=newsurl, headers=self.header)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # result['title'] = soup.select('.main-title')[0].text  # title
        # timesource = soup.select('.date-source span')[0].text
        # result['time'] = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M').strftime('%Y-%m-%d')  # time
        # result['place'] = soup.select('.source')[0].text  # source
        article = []  # collect the article text
        for p in soup.select('p'):
            article.append(p.text.strip())
        articleall = ' '.join(article)
        print('article:', articleall)
        result['article'] = articleall
        # result['editor'] = soup.select('#article p')[-1].text.strip('责任编辑:')  # editor name
        return articleall
if __name__ == "__main__":
    starttime = datetime.now()
    url_chain = 'http://www.takungpao.com.hk/hongkong/'
    url_chain_2 = 'http://www.takungpao.com.hk/mainland/'
    url_chain_3 = 'http://www.takungpao.com.hk/taiwan/'
    url_chain_4 = 'http://www.takungpao.com.hk/international/'
    url_chain_5 = 'http://www.takungpao.com.hk/finance/'
    url_chain_6 = 'http://www.takungpao.com.hk/culture/'
    url_chain_7 = 'http://www.takungpao.com.hk/sports/'
    url_chain_8 = 'http://www.takungpao.com.hk/ent/'
    url_chain_9 = 'http://renwen.takungpao.com/html/100/'
    url_chain_10 = 'http://renwen.takungpao.com/html/800/'
    url_chain_11 = 'http://www.takungpao.com/news/index.html'
    url_chain_12 = 'http://www.takungpao.com/news/232108/index.html'
    dl = downloader(url_chain_10)
    print('start getting fanye_url...')
    # fanye = dl.get_fanye_url(dl.target)
    # for x in fanye:
    #     b = dl.get_fanye_url(x)
    #     for w in b:  # this loop deduplicates while collecting pagination links
    #         if w not in fanye:
    #             if w == 'https://www.zaobao.com/news/china?page=2':
    #                 break
    #             fanye.append(w)
    #             print("pagination link " + w)
    fanye = []
    for i in range(2, 11):
        url = 'http://www.takungpao.com/news/232108/' + str(i) + '.html'
        fanye.append(url)
    print('fanye_length:', len(fanye))
    print('start getting news url...')
    # news_url = ['https://www.sinchew.com.my/content/content_2098401.html']
    news_url = []
    for x in fanye:
        a = dl.get_news_url(x)
        for w in a:  # this loop deduplicates while collecting news links
            if w not in news_url:
                news_url.append(w)
                print("news url " + w)
    print('news_url_length:', len(news_url))
    print('start getting news content...')
    count = 0
    name = 'china.txt'
    txt_dir = '../data/xiang_gang03.txt'
    for news_index in news_url:
        count += 1
        news_content = dl.getnewsdetail(news_index)
        print('The ' + str(count) + ' news_content is writing...')
        with open(txt_dir, 'a', encoding='utf-8') as f:  # close the file after each article
            f.write('The ' + str(count) + ' news:' + '\n')
            f.write(news_content + '\n' * 3)
    endtime = datetime.now()
    cost_time = endtime - starttime
    with open(txt_dir, 'a', encoding='utf-8') as f:
        f.write('\n' * 3 + str(cost_time))
    print('The cost time :', cost_time)
    # json_str = json.dumps(xinwen, ensure_ascii=False, indent=1)
    # with open(txt_dir, 'w') as json_file:
    #     json_file.write(json_str)