# Ajax message-scraping exercise, from "Python 3 Web Crawler Development in Practice".
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
import pymysql
# Base endpoint of Weibo's mobile Ajax API; query params are appended per request.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Headers that make the request look like the browser's own Ajax call.
headers = {
'Host': 'm.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2830678474',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
# If X-Requested-With is absent/null the server treats it as a normal
# (synchronous) request; 'XMLHttpRequest' marks it as an Ajax request.
}
def get_page(page):
    """Fetch one page of the target user's Weibo timeline via the Ajax API.

    Args:
        page: 1-based page index forwarded to the API's ``page`` parameter.

    Returns:
        The parsed JSON dict on HTTP 200; None on any other status or
        on a request failure (logged to stdout).
    """
    params = {
        'type': 'uid',
        'value': '2145291155',  # uid of the user being scraped
        'containerid': '1076032145291155',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        # Timeout so a hung connection cannot stall the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        print('Unexpected status', response.status_code)
    except requests.RequestException as e:
        # RequestException also covers ConnectionError and Timeout.
        print('Error', e.args)
def parse_page(json):
    """Extract the interesting fields from one page of API JSON.

    Args:
        json: dict returned by ``get_page`` (may be None on fetch failure).

    Yields:
        dict with keys ``id``, ``text``, ``attitudes``, ``comments``,
        ``reposts`` for each weibo card on the page.
    """
    if not json:
        return
    # 'data' or 'cards' can be missing on error payloads; default to empty
    # instead of raising AttributeError on None.
    for card in json.get('data', {}).get('cards') or []:
        mblog = card.get('mblog')
        if not mblog:
            # Some cards (ads, recommendations) carry no 'mblog' entry.
            continue
        # NOTE: BeautifulSoup .text is required here; .string returns None
        # whenever the tag has more than one child, whereas .text joins
        # every string descendant.
        yield {
            'id': mblog.get('id'),
            'text': BeautifulSoup(mblog.get('text'), "lxml").text,
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }
def initmysql():
    """Connect to the local MySQL ``spider`` database and ensure the
    ``weibo`` table exists.

    Returns:
        A ``(connection, cursor)`` tuple; the caller is responsible for
        closing both when done.
    """
    # utf8mb4 so 4-byte characters (emoji, common in weibo text) round-trip.
    db = pymysql.connect(host='localhost', user='root', password='',
                         port=3306, db='spider', charset='utf8mb4')
    print('连接数据库成功!!!')
    cursor = db.cursor()
    # text is capped at 256 chars; switch to TEXT/BLOB for longer posts.
    sql = """CREATE TABLE IF NOT EXISTS weibo (
            id VARCHAR(20) NOT NULL,
            text VARCHAR(256) ,
            attitudes CHAR(10) ,
            comments CHAR(10),
            PRIMARY KEY (id));
        """
    cursor.execute(sql)
    # Bug fix: the table is `weibo` (spider is the database), and this step
    # creates it rather than "connects" to it.
    print('表weibo创建成功')
    return db, cursor
###*********************************************
# NOTE: save_to_mysql first raised InterfaceError when factored out as a
# function. The cause was not a broken connection or cursor but scoping:
# `db` and `cursor` are module-level globals created in __main__, so they
# must exist before this function is called.
###**********************************************
def save_to_mysql(result):
    """Insert one weibo record, committing on success and rolling back on
    a database error.

    Relies on the module-level ``db`` and ``cursor`` created in __main__;
    the caller closes them after all inserts are done.

    Args:
        result: dict with ``id``, ``text``, ``attitudes``, ``comments`` keys.
    """
    sql = 'insert into weibo(id, text, attitudes, comments) values(%s, %s, %s, %s)'
    try:
        cursor.execute(sql, (result['id'], result['text'],
                             result['attitudes'], result['comments']))
        db.commit()
    except pymysql.MySQLError as e:
        # Narrowed from a bare except: log the cause (e.g. duplicate primary
        # key on re-runs) instead of swallowing it silently.
        print('insert failed:', e)
        db.rollback()
if __name__ == '__main__':
    db, cursor = initmysql()
    try:
        # range(1, 2) fetches page 1 only — widen the range to crawl more
        # pages (the original comment claimed two pages, which was wrong).
        for page in range(1, 2):
            for result in parse_page(get_page(page)):
                print(result)
                save_to_mysql(result)
    finally:
        # Always release the cursor and connection, even if a page fails.
        cursor.close()
        db.close()
    print('存储完毕')
# Summary: pulls one user's weibo posts together with their like, comment and
# repost counts into MySQL. The program's encapsulation is still quite rough.