分析目标页面
爬取代理ip的地址:http://www.xicidaili.com/
页面分析:
代理 IP 按行存放在 id 为 ip_list 的 table 中:只要遍历 table 对象中的每一行 tr,就可以取到该行的数据,再依次取出每一列 td 中的内容即可,总的来说比较简单。
代码示例
import requests
from bs4 import BeautifulSoup
import xlsxwriter
import sqlite3
import time
def get_html_text(url):
    """Fetch a web page and return its body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body on success, or the literal string
        "产生异常" when the request fails or the server answers with an
        error status (the failure value callers of this function receive).
    """
    headers = {
        # Must be a single line: requests rejects header values that
        # contain a line break, which made every call fail before.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        ),
    }
    try:
        # A timeout keeps an unresponsive server from blocking forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()  # raise for 4xx / 5xx answers
        r.encoding = r.apparent_encoding  # decode with the detected charset
        return r.text
    except requests.RequestException:
        return "产生异常"
def get_proxies():
    """Scrape the proxy table and return its rows as a list of dicts.

    Downloads http://www.xicidaili.com/, locates the element whose id is
    ``ip_list`` and reads cells 1..7 of every row that has at least eight
    ``td`` cells.

    Returns:
        A list of dicts with the keys "ip", "prot", "addr", "anonymous",
        "type", "alive" and "check"; each value is the ``.string`` of the
        matching cell.  The list is empty when the table cannot be found
        (for example because the download failed).
    """
    url = "http://www.xicidaili.com/"
    html = get_html_text(url)
    soup = BeautifulSoup(html, "html.parser")
    ip_list = soup.find(id="ip_list")
    if ip_list is None:
        # Download failed or the page layout changed: nothing to parse.
        return []

    # Cell order: IP, port, server location, anonymity, type, uptime,
    # last check.  NOTE: "prot" (sic) is kept because the save functions
    # in this file look the port up under that key.
    fields = ("ip", "prot", "addr", "anonymous", "type", "alive", "check")

    proxies = []
    for tr in ip_list.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 8:
            # Rows without enough td cells carry no proxy data.
            continue
        proxies.append({key: td.string for key, td in zip(fields, tds[1:8])})
    return proxies
def save_list_to_xlsx(lst):
    """Write proxy records to ``ip_list.xlsx`` (not the recommended storage).

    Row 0 of worksheet "sheet1" receives the column captions; every record
    in *lst* is printed and then written to the next row, one cell per
    field.

    Args:
        lst: Iterable of dicts as produced by ``get_proxies``.

    Returns:
        The index of the first unused row, i.e. 1 + number of records.
    """
    captions = ["代理IP地址", "端口", "服务器地址", "是否匿名", "类型", "存活时间", "验证时间"]
    # Dict keys in the same left-to-right order as the captions.
    keys = ("ip", "prot", "addr", "anonymous", "type", "alive", "check")

    workbook = xlsxwriter.Workbook("ip_list.xlsx")
    worksheet = workbook.add_worksheet("sheet1")

    # Header row.
    for column, caption in enumerate(captions):
        worksheet.write(0, column, caption)

    # One row per record, starting right below the header.
    next_row = 1
    for record in lst:
        print(record)
        for column, key in enumerate(keys):
            worksheet.write(next_row, column, record.get(key))
        next_row += 1

    workbook.close()
    return next_row
class Database(object):
    """SQLite-backed store for a pool of proxy addresses.

    Attributes:
        name: Path of the SQLite database file passed to the constructor.
        tablename: Table all statements operate on.  Starts as "ip_list"
            and is replaced by ``create_table``.
        conn: The open ``sqlite3`` connection.
        cursor: Cursor shared by all methods.
    """

    def __init__(self, name):
        """Open (or create) the SQLite database *name*."""
        self.name = name
        # "ip_list" is the table the insert statement always targeted, so
        # it stays the default until create_table() selects another one.
        self.tablename = "ip_list"
        self.conn = sqlite3.connect(self.name)
        self.cursor = self.conn.cursor()

    def _table(self):
        """Return the current table name quoted as an SQL identifier."""
        # Table names cannot be bound as parameters; quoting keeps the
        # interpolated name from being read as anything but an identifier.
        return '"%s"' % str(self.tablename).replace('"', '""')

    def create_table(self, tablename):
        """Create the proxy table if it is missing and make it current.

        Args:
            tablename: Name of the table to create and use from now on.
        """
        self.tablename = tablename
        sql = """create table if not exists %s(
            "id" integer primary key autoincrement,
            "ip" text,
            "port" integer,
            "addr" text,
            "anonymous" text,
            "type" text,
            "alive" text,
            "check" text,
            "status" integer default 1
        )""" % self._table()
        self.cursor.execute(sql)

    def insert(self, data):
        """Insert one proxy record and commit.

        Args:
            data: Sequence of seven values in the order
                (ip, port, addr, anonymous, type, alive, check).
        """
        sql = (
            'insert into %s("ip", "port", "addr", "anonymous", '
            '"type", "alive", "check") values(?,?,?,?,?,?,?)' % self._table()
        )
        self.cursor.execute(sql, data)
        self.conn.commit()

    def get_random_ip(self):
        """Return a random proxy that passes verification.

        Rows whose ``status`` is 0 are skipped.  Candidates are tried in
        random order; each one that fails verification is marked with
        status 0 by ``verify_ip``.

        Returns:
            An ``(ip, port)`` tuple, or ``None`` when no active row passes
            verification.
        """
        sql = (
            "select ip, port from %s where status != 0 order by random()"
            % self._table()
        )
        self.cursor.execute(sql)
        # Fetch everything first: verify_ip() reuses the shared cursor when
        # it disables a row, which would discard a pending result set.
        candidates = self.cursor.fetchall()
        rejected = set()
        for ip, port in candidates:
            if ip in rejected:
                continue  # a failed verification already disabled this ip
            if self.verify_ip(ip, port):
                return (ip, port)
            rejected.add(ip)
        return None

    def verify_ip(self, ip, port):
        """Check that a proxy works by fetching a page through it.

        A proxy that fails is marked as disabled via ``delete_ip``.

        Args:
            ip: Proxy host.
            port: Proxy port.

        Returns:
            True when the request through the proxy ends with a 2xx
            status, False otherwise.
        """
        http_url = "http://www.baidu.com"
        proxy_url = "http://{}:{}".format(ip, port)
        # requests picks the proxy by the scheme of the requested URL, so
        # the "http" entry is the one that routes http_url via the proxy.
        proxies = {
            "http": proxy_url,
            "https": proxy_url,
        }
        try:
            # Dead proxies often never answer; do not wait indefinitely.
            r = requests.get(http_url, proxies=proxies, timeout=5)
        except requests.RequestException:
            self.delete_ip(ip)
            return False
        # Only status codes in [200, 300) count as success.
        if 200 <= r.status_code < 300:
            return True
        self.delete_ip(ip)
        return False

    def delete_ip(self, ip):
        """Disable every row with address *ip* (sets ``status`` to 0).

        The rows are kept in the table; they are only excluded from
        ``get_random_ip``.
        """
        sql = "update %s set status=0 where ip = ?" % self._table()
        # Bind the address as a parameter: formatted into the statement
        # unquoted, a dotted address is not valid SQL.
        self.cursor.execute(sql, (ip,))
        self.conn.commit()

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        # getattr() covers an instance whose __init__ failed before the
        # attributes were assigned.
        for resource in (getattr(self, "cursor", None),
                         getattr(self, "conn", None)):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception:
                # Best-effort cleanup: already closed, or the interpreter
                # is shutting down.
                pass

    def __del__(self):
        """Release the database connection when the object is collected."""
        self.close()
def add_list_to_database(lst):
    """Store proxy records in table ``ip_list`` of ``ip_pool.db``.

    Args:
        lst: Iterable of dicts as produced by ``get_proxies``.

    Returns:
        The number of records inserted.
    """
    # Dict keys in the column order expected by Database.insert().
    keys = ("ip", "prot", "addr", "anonymous", "type", "alive", "check")

    store = Database("ip_pool.db")
    inserted = 0
    store.create_table("ip_list")
    for record in lst:
        row = tuple(record.get(key) for key in keys)
        store.insert(row)
        inserted += 1
    return inserted
if __name__ == '__main__':
    # Scrape the proxy list and store it in the database.
    proxies = get_proxies()
    ret = add_list_to_database(proxies)
    print(ret)
    # Exercise the pool: draw up to 100 verified proxies.
    database = Database("ip_pool.db")
    database.create_table("ip_list")
    for _ in range(100):
        picked = database.get_random_ip()
        if picked is None:
            # Nothing to unpack: the pool has no usable proxy.
            print("no usable proxy left in the pool")
            break
        ip, port = picked
        print(ip, port)
参考:
《小白动手搭建一个简单爬虫代理池》
《学会最简单的数据库|看完这7招就够了》
《SQLite 教程》
http://www.runoob.com/sqlite/sqlite-tutorial.html
《Beautiful Soup 4.2.0 文档》
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id5
《requests快速上手》
http://cn.python-requests.org/zh_CN/latest/user/quickstart.html