环境
- windows, python 2.7
- lxml, 正则等解析html 获取数据
- python mysqldb
- urllib, urllib2等用作http请求相关
py模拟登录csdn
参考http://blog.csdn.net/yanggd1987/article/details/52127436
chrome F12监测登录
可以看到http请求和响应的相关信息
python mysqldb
参考
http://blog.csdn.net/followingturing/article/details/7956196
测试
数据表.sql
/*
Navicat MySQL Data Transfer
Source Server : mysql
Source Server Version : 50155
Source Host : localhost:3306
Source Database : test
Target Server Type : MYSQL
Target Server Version : 50155
File Encoding : 65001
Date: 2017-11-16 10:04:00
*/
-- Disable FK checks so the dump restores regardless of table order.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for blog
-- ----------------------------
DROP TABLE IF EXISTS `blog`;
CREATE TABLE `blog` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `url` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=latin1;
-- NOTE(review): table charset is latin1 while the Python client sets utf8;
-- non-ASCII titles may be mangled on insert — consider utf8mb4. TODO confirm.
-- ----------------------------
-- Records of blog
-- ----------------------------
INSERT INTO `blog` VALUES ('2', 'a', 'b');
INSERT INTO `blog` VALUES ('4', 'c', 'd');
mysqlop.py脚本
# -*- coding: UTF-8 -*
import MySQLdb
class MySQLHelper:
    """
    Thin convenience wrapper around a MySQLdb connection.

    Opens the connection in __init__, offers raw-SQL query helpers and a
    dict-based insert, and leaves transaction control (commit/close) to the
    caller. Errors are reported by printing, matching the original script's
    style, rather than raising — so callers must tolerate None results.
    """

    def __init__(self, host, user, password, charset="utf8"):
        # Keep connection parameters around for reference/debugging.
        self.host = host
        self.user = user
        self.password = password
        self.charset = charset
        try:
            self.conn = MySQLdb.connect(host=self.host, user=self.user,
                                        passwd=self.password)
            # Force the connection character set so utf8 data round-trips.
            self.conn.set_character_set(self.charset)
            self.cur = self.conn.cursor()
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

    def selectDb(self, db):
        """Switch the connection to database *db*."""
        try:
            self.conn.select_db(db)
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

    def query(self, sql):
        """Execute raw *sql*.

        Returns the affected/selected row count, or None if execution
        failed (the error is printed, not raised).
        """
        try:
            n = self.cur.execute(sql)
            return n
        except MySQLdb.Error as e:
            print("Mysql Error:%s\nSQL:%s" % (e, sql))

    def queryRow(self, sql):
        """Execute *sql* and return the first result row (tuple) or None."""
        self.query(sql)
        result = self.cur.fetchone()
        return result

    def queryAll(self, sql):
        """Execute *sql* and return all rows as a list of dicts.

        Each dict maps column name -> str(value), using the cursor
        description for column names.
        """
        self.query(sql)
        result = self.cur.fetchall()
        desc = self.cur.description
        d = []
        for inv in result:
            # Pair every column value with its name from the description.
            _d = {}
            for i in range(0, len(inv)):
                _d[desc[i][0]] = str(inv[i])
            d.append(_d)
        return d

    def insert(self, p_table_name, p_data):
        """Insert *p_data* (dict of column -> value) into *p_table_name*.

        Returns the affected row count (0 for an empty dict, None on error).

        Fixes over the original version:
        - uses a parameterized query instead of string concatenation, so
          values are escaped by the driver (no SQL injection through, or
          breakage from, quotes in the data);
        - no longer mutates the caller's dict (the old code rewrote every
          value to "'value'" in place, corrupting p_data on reuse).
        Note: identifiers cannot be parameterized; the table/column names
        are backtick-quoted but must still come from trusted code.
        """
        if not p_data:
            return 0
        keys = list(p_data.keys())
        columns = ", ".join("`%s`" % k for k in keys)
        placeholders = ", ".join(["%s"] * len(keys))
        real_sql = "INSERT INTO `%s` (%s) VALUES (%s)" % (
            p_table_name, columns, placeholders)
        try:
            return self.cur.execute(real_sql, [p_data[k] for k in keys])
        except MySQLdb.Error as e:
            print("Mysql Error:%s\nSQL:%s" % (e, real_sql))

    def getLastInsertId(self):
        """Return the AUTO_INCREMENT id generated by the last insert."""
        return self.cur.lastrowid

    def rowcount(self):
        """Return the row count of the last executed statement."""
        return self.cur.rowcount

    def commit(self):
        """Commit the current transaction."""
        self.conn.commit()

    def close(self):
        """Close the cursor and the connection."""
        self.cur.close()
        self.conn.close()
def test_insert():
    """Manual smoke test: write one sample row into test.blog and commit."""
    helper = MySQLHelper("localhost", "root", "123456")
    helper.selectDb("test")
    row = {'title': 'c', 'url': 'd'}
    helper.insert("blog", row)
    helper.commit()
    helper.close()
def test_queryall():
    # Manual smoke test: dump every row of test.blog to stdout.
    dbhelper = MySQLHelper("localhost", "root", "123456")
    dbhelper.selectDb("test")
    rs = dbhelper.queryAll("select * from blog;")
    for row in rs:
        # row is a dict of column name -> str(value) (see queryAll).
        print row
        print row['title'] + "," + row['url']
    dbhelper.close()
if __name__ == '__main__':
    # Pick one manual test to run; the insert test is disabled by default.
    #test_insert()
    test_queryall()
模拟登录csdn,数据写入MySQL
# -*- coding: UTF-8 -*
import mysqlop
import urllib
import urllib2
import re
import cookielib
from bs4 import BeautifulSoup
# File the session cookie jar is persisted to after a successful login.
filename = 'cookie_csdn.txt'
# MozillaCookieJar keeps cookies in memory and can save them to a file.
cookie = cookielib.MozillaCookieJar(filename)
# Cookie handler so the opener stores and replays cookies automatically.
handler = urllib2.HTTPCookieProcessor(cookie)
# Build the opener from the handler; all requests below share this session.
opener = urllib2.build_opener(handler)
loginurl = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"
# Pre-login step: fetch the login page to scrape the hidden "lt" and
# "execution" tokens the form requires.
response = opener.open(loginurl)
# NOTE(review): duplicate request — the login page is fetched twice and only
# this second response is parsed; the first open above looks redundant.
response = opener.open(loginurl)
soup = BeautifulSoup(response.read(), "lxml")
# Walk the form's <input> fields to pull out the two hidden token values.
# NOTE(review): the loop variable "input" shadows the builtin of that name.
for input in soup.form.find_all("input"):
    if input.get("name") == "lt":
        lt = input.get("value")
    if input.get("name") == "execution":
        execution = input.get("value")
#print lt
#print execution
# Form fields to POST: credentials plus the scraped CSRF-style tokens.
values = {
    "username": "修改成自己的CSDN用户名",
    "password": "修改成自己的CSDN密码",
    "lt": lt,
    "execution": execution,
    "_eventId": "submit"
}
postdata = urllib.urlencode(values)
user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
# Present a browser User-Agent so the server does not reject the script.
opener.addheaders = [("User-agent", user_agent)]
# Simulate the login POST; session cookies land in the jar.
request = urllib2.Request(loginurl, postdata)
url = opener.open(request)
rs = url.read()
#print rs
# Persist cookies to cookie_csdn.txt so later runs could reuse the session.
cookie.save(ignore_discard=True, ignore_expires=True)
# After logging in, jump to the blog: page 1 of the author's article list
# (the post says it holds 15 articles).
url = "http://blog.csdn.net/qq_26437925/article/list/1"
result = opener.open(url)
page = result.read()
soup2 = BeautifulSoup(page.decode("utf-8"), "lxml")
# Each article entry is a direct <div> child of the #article_list container.
rs1 = soup2.select('div #article_list > div')
for item in rs1:
    #print item
    # Find the div with class article_title ...
    tmp = item.find('div', attrs={'class': 'article_title'})
    # ... then the <a> tag inside it.
    tmp2 = tmp.find('a')
    #print type(tmp2)
    # Regex over the serialized tag pulls out (href, link text).
    pattern = '.*<a.*href="(.*)".*>\r*\n*(.*)\r*\n*</a>.*'
    tmp4 = re.findall(pattern, str(tmp2), re.S|re.M)
    # NOTE(review): indentation lost in the source paste — as written here,
    # articles is re-initialized on every item, so only the last article
    # survives the loop; it presumably belongs before the loop. TODO confirm
    # against the original post.
    articles = []
    for nn in tmp4:
        #print len(nn)
        # nn[0] is a site-relative href; prefix the host to get a full URL.
        title_url = "http://blog.csdn.net" + nn[0]
        title_name = nn[1]
        # Original author's note: the first 8 and last 22 characters are
        # unexplained whitespace-like padding (spaces/\t/\r/\n), sliced off.
        len2 = len(title_name)
        title_name = title_name[8: len2-22]
        # title_name.replace('\n', 'x')
        # title_name.replace('\t','x')
        # title_name.replace('\r', 'x').strip('\n').strip()
        articles.append({"title_name": title_name , "title_url": title_url})
dbhelper = mysqlop.MySQLHelper("localhost", "root", "123456")
dbhelper.selectDb("test")
# Insert each article title and full URL.
for a in articles:
    ss = a["title_name"] + ": " + a["title_url"]
    # Decode before printing to avoid mojibake in the Windows cmd console.
    print '%s' %(ss.decode('utf-8'))
    # MySQL insert (disabled in the post; enable to actually write rows):
    #p_data = {'title': a["title_name"], 'url': a["title_url"]}
    #dbhelper.insert("blog", p_data)
    #dbhelper.commit()
dbhelper.close()
运行结果