通过pyspark实现ip地址查询

机器内存不够,跑不动;先记录一波

# -*- coding: utf-8 -*-
# @Time    : 2019/12/16 20:05
# @Author  :

import os

from pyspark.sql import SparkSession
from pyspark import SparkConf

os.environ['PYSPARK_PYTHON'] = "/usr/bin/python3"

# 使用集群模式
MASTER = "spark://192.168.18.126:7077"
APPName = "ip_search"


# 255.255.255.255 0~255 256  2^8 8位2进制数
# 将ip转换为特殊的数字形式  223.243.0.0|223.243.191.255|  255 2^8
# ‭11011111‬
# 00000000
# 1101111100000000
# ‭        11110011‬
# 11011111111100110000000000000000
def ip_transform(ip):
    ips = ip.split(".")  # [223,243,0,0] 32位二进制数
    ip_num = 0
    for i in ips:
        ip_num = int(i) | ip_num << 8
    return ip_num


# 二分法查找ip对应的行的索引
def binary_search(ip_num, broadcast_value):
    start = 0
    end = len(broadcast_value) - 1
    while (start <= end):
        mid = int((start + end) / 2)
        if ip_num >= int(broadcast_value[mid][0]) and ip_num <= int(broadcast_value[mid][1]):
            return mid
        if ip_num < int(broadcast_value[mid][0]):
            end = mid
        if ip_num > int(broadcast_value[mid][1]):
            start = mid


def main():
    sc_conf = SparkConf()
    sc_conf.setMaster(MASTER)
    spark = SparkSession.builder.appName(APPName).getOrCreate()
    sc = spark.sparkContext

    city_id_rdd = sc.textFile("file:///root/ip.txt").map(lambda x: x.split("|")).map(
        lambda x: (x[2], x[3], x[13], x[14]))

    # 创建一个广播变量
    # 每一个Worker上会有多个task,每一个task 都需要这一个ip表
    # 可以通过广播变量, 通知当前worker上所有的task, 来共享这个数据,避免数据的多次复制,可以大大降低内存的开销
    city_broadcast = sc.broadcast(city_id_rdd.collect())
    dest_data = sc.textFile("file:///root/20090121000132.394251.http.format").map(
        lambda x: x.split("|")[1])

    # 根据取出对应的位置信息
    def get_pos(x):
        city_broadcast_value = city_broadcast.value

        # 根据单个ip获取对应经纬度信息
        def get_result(ip):
            ip_num = ip_transform(ip)
            index = binary_search(ip_num, city_broadcast_value)
            # ((纬度,精度),1)
            return ((city_broadcast_value[index][2], city_broadcast_value[index][3]), 1)

        x = map(tuple, [get_result(ip) for ip in x])
        return x

    # 类似map,但是map是一条一条传给里面函数
    # mapPartitions数据是一部分一部分传给函数的
    dest_rdd = dest_data.mapPartitions(lambda x: get_pos(x))  # ((纬度,精度),1)

    result_rdd = dest_rdd.reduceByKey(lambda a, b: a + b)
    print(result_rdd.collect())
    sc.stop()


if __name__ == '__main__':
    main()

ip.txt

1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.0.8.0|1.0.15.255|16779264|16781311|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.0.32.0|1.0.63.255|16785408|16793599|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.1.0.0|1.1.0.255|16842752|16843007|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.1.2.0|1.1.7.255|16843264|16844799|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.1.8.0|1.1.63.255|16844800|16859135|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.2.0.0|1.2.1.255|16908288|16908799|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.2.2.0|1.2.2.255|16908800|16909055|亚洲|中国|北京|北京|海淀|北龙中网|110108|China|CN|116.29812|39.95931
1.2.4.0|1.2.4.255|16909312|16909567|亚洲|中国|北京|北京||中国互联网信息中心|110100|China|CN|116.405285|39.904989
1.2.5.0|1.2.7.255|16909568|16910335|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.2.8.0|1.2.8.255|16910336|16910591|亚洲|中国|北京|北京||中国互联网信息中心|110100|China|CN|116.405285|39.904989
1.2.9.0|1.2.127.255|16910592|16941055|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.3.0.0|1.3.255.255|16973824|17039359|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.4.1.0|1.4.3.255|17039616|17040383|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.4.4.0|1.4.4.255|17040384|17040639|亚洲|中国|北京|北京|海淀|北龙中网|110108|China|CN|116.29812|39.95931
1.4.5.0|1.4.7.255|17040640|17041407|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
1.4.8.0|1.4.127.255|17041408|17072127|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178
1.8.0.0|1.8.255.255|17301504|17367039|亚洲|中国|北京|北京|海淀|北龙中网|110108|China|CN|116.29812|39.95931
1.10.0.0|1.10.7.255|17432576|17434623|亚洲|中国|广东|广州||电信|440100|China|CN|113.280637|23.125178

20090121000132.394251.http.format

20090121000132095572000|125.213.100.123|show.51.com|/shoplist.php?phpfile=shoplist2.php&style=1&sex=137|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Mozilla/4.0(Compatible Mozilla/4.0(Compatible-EmbeddedWB 14.59 http://bsalsa.com/ EmbeddedWB- 14.59  from: http://bsalsa.com/ )|http://show.51.com/main.php|
20090121000132124542000|117.101.215.133|www.jiayuan.com|/19245971|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler 4.0)|http://photo.jiayuan.com/index.php?uidhash=d1c3b69e9b8355a5204474c749fb76ef|__tkist=0; myloc=50%7C5008; myage=2009; PROFILE=14469674%3A%E8%8B%A6%E6%B6%A9%E5%92%96%E5%95%A1%3Am%3Aphotos2.love21cn.com%2F45%2F1b%2F388111afac8195cc5d91ea286cdd%3A1%3A%3Ahttp%3A%2F%2Fimages.love21cn.com%2Fw4%2Fglobal%2Fi%2Fhykj_m.jpg; last_login_time=1232454068; SESSION_HASH=8176b100a84c9a095315f916d7fcbcf10021e3af; RAW_HASH=008a1bc48ff9ebafa3d5b4815edd04e9e7978050; COMMON_HASH=45388111afac8195cc5d91ea286cdd1b; pop_1232093956=1232468896968; pop_time=1232466715734; pop_1232245908=1232469069390; pop_1219903726=1232477601937; LOVESESSID=98b54794575bf547ea4b55e07efa2e9e; main_search:14469674=%7C%7C%7C00; registeruid=14469674; REG_URL_COOKIE=http%3A%2F%2Fphoto.jiayuan.com%2Fshowphoto.php%3Fuid_hash%3D0319bc5e33ba35755c30a9d88aaf46dc%26total%3D6%26p%3D5; click_count=0%2C3363619
20090121000132406516000|117.101.222.68|gg.xiaonei.com|/view.jsp?p=389|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; CIBA)|http://home.xiaonei.com/Home.do?id=229670724|_r01_=1; __utma=204579609.31669176.1231940225.1232462740.1232467011.145; __utmz=204579609.1231940225.1.1.utmccn=(direct)
20090121000132581311000|115.120.36.118|tj.tt98.com|/tj.htm|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://www.tt98.com/|
20090121000132864647000|123.197.64.247|cul.sohu.com|/20071227/n254338813_22.shtml|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TheWorld)|http://cul.sohu.com/20071227/n254338813_22.shtml|ArticleTab=visit:1; IPLOC=unknown; SUV=0901080709152121; vjuids=832dd37a1.11ebbc5d590.0.b20f858f14e918; club_chat_ircnick=JaabvxC4aaacQ; spanel=%7B%22u%22%3A%22%22%7D; vjlast=1232467312,1232467312,30
20090121000133296729000|222.55.57.176|down.chinaz.com|/|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; TencentTraveler 4.0)||cnzz_a33219=0; vw33219=%3A18167791%3A; sin33219=http%3A//www.itxls.com/wz/wyfx/it.html; rtime=0; ltime=1232464387281; cnzz_eid=6264952-1232464379-http%3A//www.itxls.com/wz/wyfx/it.html
20090121000133331104000|123.197.66.93|www.pkwutai.cn|/down/downLoad-id-45383.html|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7)|http://www.baidu.com/s?tn=b1ank_pg&ie=gb2312&bs=%C3%C0%C6%BC%B7%FE%D7%B0%B9%DC%C0%ED%C8%ED%BC%FE&sr=&z=&cl=3&f=8&wd=%C6%C6%BD%E2%C3%C0%C6%BC%B7%FE%D7%B0%B9%DC%C0%ED%C8%ED%BC%FE&ct=0|
20090121000133446262000|115.120.12.157|v.ifeng.com|/live/|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; CIBA)|http://www.ifeng.com/|userid=1232466610953_4339; location=186; sclocationid=10002; vjuids=22644b162.11ef4bc1624.0.63ad06717b426; vjlast=1232466614,1232467297,13
20090121000133456256000|115.120.7.240|cqbbs.soufun.com|/3110502342~-1~2118/23004348_23004348.htm|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 2.0.50727; CIBA)||new_historysignlist=%u534E%u6DA6%u4E8C%u5341%u56DB%u57CE%7Chttp%3A//cqbbs.soufun.com/board/3110502342/%7C%7C%u9A8F%u9038%u7B2C%u4E00%u6C5F%u5CB8%7Chttp%3A//cqbbs.soufun.com/board/3110169184/%7C%7C%u793E%u533A%u4E4B%u661F%7Chttp%3A//cqbbs.soufun.com/board/sqzx/%7C%7C; SoufunSessionID=2y5xyr45kslc0zbdooqnoo55; viewUser=1; vjuids=-870e9088.11ee89aba57.0.be9c3d988def8; vjlast=1232263101,1232380806,11; new_viewtype=1; articlecolor=#000000; usersms_pop_type=1; articlecount=186; __utma=101868291.755195653.1232450942.1232450942.1232450942.1; __utmz=101868291.1232450942.1.1.utmccn=(referral)
20090121000133586141000|117.101.219.241|12.zgwow.com|/launcher/index.htm|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)||
20090121000133744103000|123.197.49.171|2.82yyy.com|/32/webpage/L/2.Html|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; QQDownload 1.7; TencentTraveler ; Maxthon; .NET CLR 1.1.4322)|http://2.82yyy.com/32/webpage/L/1.Html|cnzz_a998284=3; vw998284=%3A52225577%3A68566865%3A68566789%3A68566815%3A; sin998284=none; rtime=0; ltime=1232466017187; cnzz_eid=1870962-1232464084-; cnzz_a1021073=3; vw1021073=%3A34926533%3A; sin1021073=none; 61kkk=1,1232464210281
20090121000133757842000|117.101.213.104|game.7679.com|/scroll.php|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; CIBA)|http://game.7679.com/games/1021/|cnzz_a30008507=7; rtime=2; ltime=1232466389781; cnzz_eid=12877395-http%3A//apps.51.com/application.php%3Fapp_key%3D4a99277cca695a34ba39719399030076; 4a99277cca695a34ba39719399030076_user=tangqingqing33; 4a99277cca695a34ba39719399030076_session_key=1b203792173c71e961fd8cafdf011f9d; 4a99277cca695a34ba39719399030076_time=1232466378; 4a99277cca695a34ba39719399030076=cf9441753f0b3312fd76b18b68261287
20090121000134038848000|115.120.10.205|bf.bearcn.com|/user.asp?userid=9795|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; TencentTraveler ; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 1.1.4322; .NET CLR 2.0.50727)|http://bf.bearcn.com/Photo.asp?page=7|ASPSESSIONIDCATABTSR=MDIFCDPCMMGJBDEBMJCPCGHF; BearCN=viewid=20944; ASPSESSIONIDCAQDCQSR=OEDPCHPCOJIKCGECBIFLAOGI
20090121000134178887000|117.101.218.147|www.baidu.com|/|test||BAIDUID=4221AC111420E40EFA125AEC596813B7:FG=1
20090121000134259104000|115.120.17.80|www.sjshu.com|/bookdown/ShowSoftDown.asp?UrlID=1&SoftID=22222|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7; .NET CLR 2.0.50727)|http://www.sjshu.com/bookdown/200803/22222.shtml|ASPSESSIONIDQQASTRAD=CEJGLDLCADCNKJOPLAMEKDJJ; AJSTAT_ok_pages=3; AJSTAT_ok_times=1; ppad_cookie_0=1
20090121000134372468000|117.101.220.175|www.zhaodll.com|/dll/softdown.asp?softid=306|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)|http://www.zhaodll.com/dll/k/200607/306.html|ASPSESSIONIDQSTACDQD=HGCADNMCGHEGNPAFENHNDHKM; cnzz_a206791=0; vw206791=%3A60592519%3A; sin206791=http%3A//www.baidu.com/s%3Fwd%3Dksuser.dll%26tn%3Dyjhy_dg%26bar%3D; rtime=0; ltime=1232467247934; cnzz_eid=75465914-1232467247-http%3A//www.baidu.com/s%3Fwd%3Dksuser.dll%26tn%3Dyjhy_dg%26bar%3D
20090121000134389671000|123.197.66.12|video.baidu.com|/v?ct=301989888&rn=20&pn=0&db=0&s=8&word=%C9%AB%BC%B4%CA%C7%BF%D52%CC%F0%D0%D4%C9%AC%B0%AE|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)|http://web.gougou.com/bfs?search=%e8%89%b2%e5%8d%b3%e6%98%af%e7%a9%ba%32%e7%94%9c%e6%80%a7%e6%b6%a9%e7%88%b1&page=1&id=10000001&f=1&t=-1|BAIDUID=7AEB83E2A2E24FE200CE048A70DCBD9E:FG=1; BDSTAT=cd2b93848358eb180fb30f2442a7d933c895d143ad4bd11373f0820258afe324
20090121000134422762000|125.213.100.236|longma168.com|/al/468x60-1.htm|Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)|http://www.longwang.biz/pm.php?action=view&pmid=1985410|
20090121000134802198000|123.197.66.208|webim.51.com|/webim/main.php?to=|Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; CIBA)|http://my.51.com/webim/index.php?refer=/|
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Cocktail_py

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值