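# This is the crawler code for scraping cnBeta.
# Analysing the page structure took a long time, but the result is still not satisfactory.
# The page-analysis part is not done well; if anyone more experienced can point out
# a better approach, it would be greatly appreciated.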
import os
import re

import requests
from bs4 import BeautifulSoup
def article_num():
    """Return the numeric id of the first article linked on the cnBeta home page.

    The home page is downloaded, the first element matching ``dt > a`` is
    taken, and the first run of digits in its ``href`` is returned.

    Returns:
        int: The id parsed from the first headline link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``dt > a`` link or the link's ``href``
            contains no digits.
    """
    headers = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    home_url = 'https://www.cnbeta.com/'
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(home_url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    links = soup.select('dt > a')
    if not links:
        raise ValueError('no "dt > a" headline link found on %s' % home_url)

    # .get() returns None when the anchor has no href attribute.
    href = links[0].get('href') or ''
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'\d+', href)
    if match is None:
        raise ValueError('no article id found in link %r' % href)
    return int(match.group())
def get_url(num):
    """Return the cnBeta article page URL for the article id ``num``."""
    template = 'https://www.cnbeta.com/articles/%d.htm'
    return template % num
# Module-level cursor holding the id of the next article to fetch. It is
# initialised from the home page (article_num() performs an HTTP request when
# this module is loaded) and is decremented by get_data() on every call.
the_num = article_num()
def get_data():
    """Fetch the article at the current cursor and return its title and text.

    Reads the module-level ``the_num`` to build the article URL, then moves
    the cursor down by 2 before the request is made, so the next call targets
    a different article even when this one fails.

    Returns:
        dict: ``{'title': <h1 text>, 'content': <text>}`` where the content is
        the first summary paragraph followed by every article paragraph
        (internal newlines removed), joined by blank lines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no title or no summary paragraph.
    """
    global the_num
    url = get_url(the_num)
    the_num -= 2

    # No explicit 'accept-encoding': advertising 'br' makes the server free to
    # answer with brotli, which requests can only decode when an optional
    # brotli package is installed. Leaving the header out lets requests
    # advertise exactly the encodings it is able to decode.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        # NOTE(review): hard-coded cookie that looks copied from a browser
        # session (analytics ids and a _csrf token); it is probably stale —
        # confirm whether the site still needs it and drop it if not.
        'cookie': '__utmz=208385984.1534339138.1.1.utmccn=(direct)|utmcsr=(direct)|utmcmd=(none); trc_cookie_storage=taboola%2520global%253Auser-id%3D0f0a0768-d08e-4468-83fc-2bec021e4166-tuct26dadc3; __utma=208385984.785811501.1534339134.1534699205.1534821982.6; _ga=GA1.2.785811501.1534339134; _gid=GA1.2.1661929009.1535084439; Hm_lvt_4216c57ef1855492a9281acd553f8a6e=1535114014,1535126454,1535134240,1535161494; _csrf=7dc1d4a83689bc5f24439fd322a9116a3819f55dc86f406b07a1f600bbc49419a%3A2%3A%7Bi%3A0%3Bs%3A5%3A%22_csrf%22%3Bi%3A1%3Bs%3A32%3A%22GTPGe79wMADBkqDIcKTpiBavtP0dM7Ao%22%3B%7D; Hm_lpvt_4216c57ef1855492a9281acd553f8a6e=1535166727; _gat_gtag_UA_124336984_1=1',
        'pragma': 'no-cache',
        'referer': 'https://www.cnbeta.com/',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    title_nodes = soup.select('header.title > h1')
    if not title_nodes:
        raise ValueError('no article title found at %s' % url)

    summary_nodes = soup.select('div.article-summary > p')
    if not summary_nodes:
        raise ValueError('no article summary found at %s' % url)

    # Summary first, then each body paragraph flattened onto a single line.
    paragraphs = [summary_nodes[0].text]
    paragraphs.extend(
        node.text.replace('\n', '')
        for node in soup.select('div.article-content > p')
    )

    return {
        'title': title_nodes[0].text,
        'content': '\n\n'.join(paragraphs),
    }
def main(num):
    """Fetch ``num`` articles via get_data() and write them to one text file.

    The output goes to ``.//wenzhang/cnBeta1.txt`` (the directory is created
    when missing and the file is overwritten). Each article is written with
    a numbered title line, its content and a separator; an article that
    cannot be fetched or parsed gets a placeholder entry instead and the loop
    carries on. Progress is printed to stdout for every article.

    Args:
        num: How many articles to attempt.
    """
    out_path = './/wenzhang/' + 'cnBeta1.txt'
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with open(out_path, 'w', encoding='utf-8') as f:
        for i in range(1, num + 1):
            try:
                data = get_data()
            except Exception:
                # Best effort: record the failing article and keep going.
                # ``Exception`` rather than a bare ``except`` so Ctrl-C and
                # SystemExit still stop the crawl.
                # get_data() has already moved the cursor down by 2, so the
                # article just attempted is the_num + 2.
                f.write('>> ' + str(i) + ' >>>> ' + str(the_num + 2) + " 这个有问题..." + ' <<<<\n\n' + '\n\n---------------------------------\n\n\n')
                print(str(the_num + 2) + " Noooooo ...", end='\n------------------------\n\n\n')
            else:
                f.write('>> ' + str(i) + ' >>>> ' + data['title'] + ' <<<<\n\n')
                f.write(data['content'] + '\n\n---------------------------------\n\n\n\n')
                print(str(the_num + 2) + " OK ...", end='\n------------------------\n\n\n')
# Number of articles to crawl in this run.
num = 10
# Starts the crawl as soon as the module is loaded; there is no
# `if __name__ == "__main__":` guard, so importing this file also triggers it.
main(num)