Scraping Sina news pages

# Unique elements are identified by id; prefix the id with # in the selector

For example, use select to find all elements whose id is title:

alink = soup.select('#title')
print(alink)           # select() returns a list of matching elements
print(alink[0].text)   # take the first match to read its text

# Repeated (non-unique) elements are identified by class; prefix the class with . in the selector

For example, use select to find all elements whose class is link:

for link in soup.select('.link'):
    print(link)
    print(link.text)

# Use select to find the href link of every a tag

alinks = soup.select('a')   # tag names take no prefix
for link in alinks:
    print(link['href'])
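
Putting the three selector patterns together, here is a minimal self-contained sketch; the HTML snippet and the variable names below are made up purely for illustration:

from bs4 import BeautifulSoup

html_sample = '''
<h1 id="title">Headline</h1>
<a class="link" href="http://example.com/a">Link A</a>
<a class="link" href="http://example.com/b">Link B</a>
'''
soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.select('#title')[0].text)            # by id
print([e.text for e in soup.select('.link')])   # by class
print([a['href'] for a in soup.select('a')])    # every a tag's href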
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import json
import re                      # used by getCommentCount to extract the news id
import pandas
from datetime import datetime  # used by getNewsDetail to parse the publish time


# Get the comment count and the comment content
# jd = json.loads(comments.text.strip('var data='))
# jd['result']['count']['total']

# For a class, prefix the selector with . ; for an id, prefix it with #
# soup.select('.article-editor')[0].text.lstrip('责任编辑')
# soup.select('#commentcount1')

# Wrap the comment-count scraping into a function
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=an&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'

def getCommentCount(newsurl):
	m = re.search('doc-i(.*).shtml', newsurl)
	newsid = m.group(1)                                 # pull the news id out of the article URL
	comments = requests.get(commentURL.format(newsid))
	jd = json.loads(comments.text.strip('var data='))   # drop the leading 'var data=' JS wrapper
	return jd['result']['count']['total']
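
A quick usage sketch of getCommentCount; the article URL below is hypothetical and only illustrates the doc-i<newsid>.shtml pattern the regular expression expects:

sample_url = 'http://news.sina.com.cn/c/nd/2016-08-20/doc-ifxvctcc0000000.shtml'  # hypothetical URL
print(getCommentCount(sample_url))   # total number of comments for that article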

# Wrap the article-detail scraping into a function

def getNewsDetail(newsurl):
	result = {}
	res = requests.get(newsurl)
	res.encoding = 'utf-8'
	soup = BeautifulSoup(res.text, 'html.parser')  # or 'lxml'
	result['title'] = soup.select('#artibodyTitle')[0].text
	result['newssource'] = soup.select('.time-source span a')[0].text
	timesource = soup.select('.time-source')[0].contents[0].strip()
	result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')   # parse the publish time
	result['article'] = ''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # skip the last <p> (editor line)
	result['editor'] = soup.select('.article-editor')[0].text.lstrip('责任编辑:')            # strip the leading label
	result['comments'] = getCommentCount(newsurl)
	return result
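
The same hypothetical URL from the sketch above can be used to exercise getNewsDetail and inspect every field it returns:

detail = getNewsDetail(sample_url)   # sample_url: the hypothetical article URL defined earlier
for key, value in detail.items():
	print(key, ':', value)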

# Finding the pagination link
# 1. Open the Network tab in the browser's developer tools
# 2. Click the JS filter
# 3. Locate the request whose URL contains page=?

# Parse the pagination response
res = requests.get('http://.....page=1&....')  # abbreviated; the paging URL differs from site to site
jd = json.loads(res.text.lstrip('...').rstrip('....'))  # remove the extra characters on both ends
for ent in jd['result']['data']:
	print(ent['url'])   # print the link of every article on the page

# Build a function that parses the links on a list page
def parseListLinks(url):
	newsdetails = []
	res = requests.get(url)
	jd = json.loads(res.text.lstrip('newsloadercallback(').rstrip(');'))  # strip the JSONP wrapper
	for ent in jd['result']['data']:
		newsdetails.append(getNewsDetail(ent['url']))
	return newsdetails

# Batch-fetch the article details for every list page
url = 'http://......page={}'
news_total = []
for i in range(1,3):
	newsurl = url.format(i)
	newsary = parseListLinks(newsurl)
	news_total.extend(newsary)

# Save the data to Excel
df = pandas.DataFrame(news_total)   # collect the scraped articles into a DataFrame
df.to_excel('news.xlsx')
# Save the data to a database
import sqlite3
with sqlite3.connect('news.sqlite') as db:
	df.to_sql('news',con=db)

# Read the data back from the database
import sqlite3
with sqlite3.connect('news.sqlite') as db:
	df2 = pandas.read_sql_query('SELECT * FROM news',con=db)
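
As a quick sanity check on the round trip, the first few rows read back from SQLite can be printed:

print(df2.head())   # show the first rows of the table read back from news.sqlite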

 
