'''
AUTHOR: vforlove
CREATED: 2018-08-02
IDE:PyCharm
version:3.6.6
'''
import requests
import time
import pymssql
from pyquery import PyQuery as pq
from urllib.parse import urlencode
# Endpoint, target table name and HTTP headers used for every request.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
table = 'Ajax_weibo'
headers = {
    'Referer': 'https://m.weibo.cn/u/1713926427',
    # One logical string, split only to keep the source line short.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.75 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}
# Database setup: connect to SQL Server and create the target table if it
# does not exist yet.
# pymssql.connect() signals failure by raising, not by returning a falsy
# object, so the failure message is printed from an exception handler and the
# error is re-raised (the script cannot continue without a connection).
try:
    conn = pymssql.connect(
        host='FORLOVE',
        user='sa',
        password='',
        database='spider',
        autocommit=True,
    )
    cursor = conn.cursor()
except pymssql.Error:
    print('数据库连接失败!')
    raise
cursor.execute('use spider')
print('数据库连接成功!')
# The table is created only when no object named 'Ajax_weibo' exists yet.
sql = """IF NOT EXISTS (SELECT * FROM sysobjects WHERE name='Ajax_weibo') CREATE TABLE Ajax_weibo(text nvarchar (140) NOT NULL, attitudes_count varchar(10), comments_count varchar(10),reposts_count varchar(10), created_at nvarchar(10), primary key(text))"""
cursor.execute(sql)
def get_page(page):
    """Request one page of the feed and return the decoded JSON body.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The decoded JSON response when the server answers with HTTP 200;
        ``None`` on any other status code, on a request error, or when the
        body is not valid JSON.
    """
    params = {
        'type': 'uid',
        'value': '1713926427',
        'containerid': '1076031713926427',
        'page': page,
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            print('requests successful!')
            return response.json()
        print('error', 'unexpected status code', response.status_code)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        print('error', e.args)
    return None
def parse_page(json):
    """Yield one record per post contained in a decoded feed page.

    Args:
        json: Decoded response dict, or ``None``/empty when the page could
            not be fetched.

    Yields:
        A dict with the keys ``text``, ``attitudes_count``,
        ``comments_count``, ``reposts_count`` and ``created_at``. ``text`` is
        the post's ``text`` field passed through PyQuery's ``.text()``; the
        other values are copied from the post as-is.
    """
    if not json:
        return
    # A response without 'data' or 'cards' is treated as an empty page
    # instead of raising on None.
    data = json.get('data') or {}
    cards = data.get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        # Some cards carry no 'mblog' entry; skip them.
        if mblog is None:
            continue
        yield {
            'text': pq(mblog.get('text')).text(),
            'attitudes_count': mblog.get('attitudes_count'),
            'comments_count': mblog.get('comments_count'),
            'reposts_count': mblog.get('reposts_count'),
            'created_at': mblog.get('created_at'),
        }
def SQL_Ajax(result):
    """Insert one parsed record into the database table.

    The column list is taken from the keys of ``result`` and the values are
    passed as query parameters. A failed insert is reported together with its
    cause and rolled back; it does not stop the crawl.

    Args:
        result: Mapping of column name to value for a single row.
    """
    columns = ','.join(result.keys())
    placeholders = ','.join(['%s'] * len(result))
    insert_sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
        table=table, keys=columns, values=placeholders)
    print(insert_sql)
    row = tuple(result.values())
    # Only the execute call is guarded, so a problem while printing can no
    # longer be misreported as a failed insert.
    try:
        cursor.execute(insert_sql, row)
    except Exception as e:
        # Best-effort insert: keep crawling, but show why the row was
        # rejected instead of hiding the error. KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        print('INSERT FAILED!', e)
        conn.rollback()
    else:
        print(row)
        print('INSERT SUCCESSFUL!')
# Entry point: crawl pages 1-5 and store every parsed post.
if __name__ == '__main__':
    try:
        for page in range(1, 6):
            page_json = get_page(page)
            for result in parse_page(page_json):
                print(result)
                SQL_Ajax(result)
            # NOTE(review): indentation was lost in the original listing;
            # pausing once per page request is assumed here - confirm.
            time.sleep(1)
    finally:
        # Release the database connection even when the crawl is interrupted.
        conn.close()
错误:
Traceback (most recent call last):
  File "D:/Python/src/Ajax-weibo.py", line 82, in <module>
    for result in results:
  File "D:/Python/src/Ajax-weibo.py", line 57, in parse_page
    'text': pq(item.get('text')).text(),
AttributeError: 'NoneType' object has no attribute 'get'
错误分析:
经过对Ajax返回的JSON数据的分析,发现cards列表中有的元素没有mblog字段,此时item.get('mblog')返回None,再对None调用get()就会在遍历时报错
解决方案:
在页面解析函数中,紧接着 item = item.get('mblog') 之后加入:
    if item is None:
        continue
顺便吐槽一下自己的细心程度:在存入数据库的时候无论如何也存不进去数据,commit()函数写了,也改了AUTOCOMMIT,还是不行,最后发现是执行插入时把cursor.execute()里的insert_sql写成了sql,而sql是上面建表的语句,难瘦!!!