(一)基于 Python 的 requests 库,模拟登录
1 爬取公司信息,不用保持cookie的情况,代码如下:
# -*- coding: utf-8 -*- import requests import requests import re import MySQLdb import mysql.connector import MySQLdb as mdb Conn = MySQLdb.connect(user='wenrui', passwd='wenrui', host='localhost', db='companydata', charset="utf8") cur = Conn.cursor() cur.execute( "CREATE TABLE CompanyData(name VARCHAR(70),realnam VARCHAR(20),phone VARCHAR(20),mail VARCHAR(20),Bigaddress varchar(10),Smladdress varchar(70))") Conn.commit() class Company: def __init__(self): self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6', 'Cookie': 'Hm_lvt_3ac4a19eb5f266a8046f9f5b29c52a00=1447818297; Hm_lpvt_3ac4a19eb5f266a8046f9f5b29c52a00=1447818297; so=296ef4827c1eabe98da0650c74e0bc7e; vr_1447818247=296ef4827c1eabe98da0650c74e0bc7e; un=2af45194a1467fca68d2ab4b6b0c083a58ebf139; zh_CN=zh_CN; PLAY_SESSION="74b2f9223df0e04bd8d69cc881072bb46fce9e63-userId=418920"' #直接把cookie写入请求头即可 } self.rname = ur'target="_blank">( .*?)</a>' self.rrealname = ur'联系人:(.{1,6})&' # 匹配联系人:后面任意一个1-4次,遇到&后结束 self.rphone = r'<label id=".*?">(.*?)</label>' self.rmail = ur'href="mailto.*>(.*?)</a>' self.raddress = ur"<font color='red'>(.*?)</font>(.*?)</p>" # self.raddress=ur'地.*址:(.*)' def getPage(