我一直在研究一个简单网络上的图遍历算法,我想用多处理来运行它,因为当我在整个网络上扩展它时,它需要大量的I/O绑定调用。简单的版本运行非常快:
already_seen = {}
already_seen_get = already_seen.get
GH_add_node = GH.add_node
GH_add_edge = GH.add_edge
GH_has_node = GH.has_node
GH_has_edge = GH.has_edge
def graph_user(user, depth=0):
logger.debug("Searching for %s", user)
logger.debug("At depth %d", depth)
users_to_read = followers = following = []
if already_seen_get(user):
logging.debug("Already seen %s", user)
return None
result = [x.value for x in list(view[user])]
if result:
result = result[0]
following = result['following']
followers = result['followers']
users_to_read = set().union(following, followers)
if not GH_has_node(user):
logger.debug("Adding %s to graph", user)
GH_add_node(user)
for follower in users_to_read:
if not GH_has_node(follower):
GH_add_node(follower)
logger.debug("Adding %s to graph", follower)
if depth < max_depth:
graph_user(follower, depth + 1)
if GH_has_edge(follower, user):
GH[follower][user]['weight'] += 1
else:
GH_add_edge(user, follower, {'weight': 1})
它实际上比我的多处理版本快得多:
to_write = Queue()
to_read = Queue()
to_edge = Queue()
already_seen = Queue()
def fetch_user():
seen = {}
read_get = to_read.get
read_put = to_read.put
write_put = to_write.put
edge_put = to_edge.put
seen_get = seen.get
while True:
try:
logging.debug("Begging for a user")
user = read_get(timeout=1)
if seen_get(user):
continue
logging.debug("Adding %s", user)
seen[user] = True
result = [x.value for x in list(view[user])]
write_put(user, timeout=1)
if result:
result = result.pop()
logging.debug("Got user %s and result %s", user, result)
following = result['following']
followers = result['followers']
users_to_read = list(set().union(following, followers))
[edge_put((user, x, {'weight': 1})) for x in users_to_read]
[read_put(y, timeout=1) for y in users_to_read if not seen_get(y)]
except Empty:
logging.debug("Fetches complete")
return
def write_node():
users = []
users_app = users.append
write_get = to_write.get
while True:
try:
user = write_get(timeout=1)
logging.debug("Writing user %s", user)
users_app(user)
except Empty:
logging.debug("Users complete")
return users
def write_edge():
edges = []
edges_app = edges.append
edge_get = to_edge.get
while True:
try:
edge = edge_get(timeout=1)
logging.debug("Writing edge %s", edge)
edges_app(edge)
except Empty:
logging.debug("Edges Complete")
return edges
if __name__ == '__main__':
pool = Pool(processes=1)
to_read.put(me)
pool.apply_async(fetch_user)
users = pool.apply_async(write_node)
edges = pool.apply_async(write_edge)
GH.add_weighted_edges_from(edges.get())
GH.add_nodes_from(users.get())
pool.close()
pool.join()
我不明白的是为什么单进程版本要快得多。理论上,多处理版本应该同时读写。我怀疑队列中存在锁争用,这是导致速度减慢的原因,但我没有任何证据证明这一点。当我缩放fetch用户进程的数量时,它似乎运行得更快,但随后我在同步它们之间看到的数据时遇到了问题。所以我的一些想法是
这是一个很好的申请吗
多处理?我原来是
使用它是因为我想
以并行方式从数据库中提取。
在从同一队列读写时,如何避免资源争用?
我是否错过了一些明显的设计警告?
如何在读卡器之间共享一个查阅表格,以便不连续两次获取同一用户?
当增加获取进程的数量时,写入程序最终锁定。看起来写入队列没有写入,但读取队列已满。有没有比超时和异常处理更好的方法来处理这种情况?