Python 抓取数据并发送给 Kafka

 

 

#coding=UTF-8
import requests
import  json
from lxml import  etree
from pykafka import KafkaClient
import sys
import schedule
import time

class Aiqiyi():
    """Hold the Kafka topic that scraped records are published to.

    Args:
        hosts: Broker address string handed to ``KafkaClient(hosts=...)``.
            Defaults to ``"127.0.0.1:9092"``, the value that used to be
            hard-coded here.
        topic: Name of the topic to look up, as ``str`` or ``bytes``.
            Defaults to ``"test"``, the value that used to be hard-coded.

    Attributes set by ``__init__``:
        client: The ``KafkaClient`` created for ``hosts``.
        topic: The topic object looked up on ``client.topics``.
    """

    def __init__(self, hosts="127.0.0.1:9092", topic="test"):
        # Keep the client reachable from the instance instead of dropping it
        # when __init__ returns.
        self.client = KafkaClient(hosts=hosts)
        # The topic lookup is done with a bytes key (the original passed
        # 'test'.encode()), so encode str names and pass bytes through.
        topic_key = topic.encode() if isinstance(topic, str) else topic
        self.topic = self.client.topics[topic_key]
        print(self.client.topics)

def sendMessage(self, mydict):
    """Serialize ``mydict`` to JSON and produce it to ``self.topic``.

    This is a module-level function even though its first parameter is
    named ``self``; any object exposing a ``topic`` attribute can be passed.

    Args:
        self: Object whose ``topic`` provides ``get_sync_producer``.
        mydict: JSON-serializable payload.
    """
    with self.topic.get_sync_producer(delivery_reports=True) as sync_producer:
        # json.dumps yields str; the producer is handed UTF-8 bytes.
        encoded = json.dumps(mydict).encode('utf-8')
        sync_producer.produce(encoded)
def getProperti():
    """Scrape the iQiyi listing page and publish one JSON record per item.

    Every ``<li>`` of the listing ``<ul>`` is inspected on its own. For each
    one, a dict with the keys ``name``, ``href``, ``img`` and ``time`` is
    built and passed to the module-level ``sendMessage`` together with an
    ``Aiqiyi`` instance (which supplies the Kafka topic).

    Takes no arguments and returns ``None``. A failed HTTP request is
    reported with ``print`` and the run is skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    url = 'http://list.iqiyi.com/www/25/-------------4-1-2-iqiyi-1-.html'

    try:
        # Without a timeout a stalled connection would block the caller
        # (the scheduler loop) indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # This function is run periodically; report and let the next run retry.
        print("request failed: %s" % exc)
        return

    sourceHtml = response.content.decode('utf-8')
    selector = etree.HTML(sourceHtml)

    # Select every <li> of the list (the earlier version selected only li[1]
    # and relied on document-wide '//' queries to reach the other entries).
    items = selector.xpath('/html/body/div[3]/div/div/div[3]/div/ul[@class="site-piclist site-piclist-180101 site-piclist-auto"]/li')
    if not items:
        return

    # sendMessage needs an object carrying the Kafka topic as its first
    # argument; build it once per run, and only when there is data to send.
    kafka = Aiqiyi()

    for item in items:
        # './/' keeps each query inside the current <li>; a bare '//' would
        # search the whole document. '@class=' replaces the former
        # ['site-piclist_info'] predicate, a string literal that is always true.
        # NOTE(review): assumes the info div's class attribute is exactly
        # 'site-piclist_info' - confirm against the live page.
        # programme link
        hrefs = item.xpath(".//div[@class='site-piclist_info']/div[1]/p/a/@href")
        # programme name
        names = item.xpath(".//div[@class='site-piclist_info']/div[1]/p/a/text()")
        # programme image
        imgs = item.xpath(".//div[@class='site-piclist_pic']/a/img/@src")
        # release time (named to avoid shadowing the ``time`` module)
        air_times = item.xpath(".//div[@class='site-piclist_info']/div[@class='role_info']/text()")

        for name, href, img, air_time in zip(names, hrefs, imgs, air_times):
            record = {
                "name": str(name.strip()),
                "href": str(href.strip()),
                "img": str(img.strip()),
                "time": str(air_time.replace("\r\n", "").strip()),
            }
            # Publish the scraped record itself (previously a hard-coded dummy
            # list was sent, and the call was missing its first argument).
            sendMessage(kafka, record)
def job():
    """Print a fixed status line; takes no arguments and returns None."""
    status_line = "I'm working..."
    print(status_line)

if __name__ == "__main__":
    # Register the scraper to run once per minute.
    schedule.every(1).minutes.do(getProperti)
    # The polling loop belongs inside the guard: at module level it would
    # also run on import, blocking forever with no job registered.
    while True:
        schedule.run_pending()
        time.sleep(1)

 

 

 

#coding=UTF-8

from pykafka import KafkaClient
import json
import sys

class pythonSendkafka:
    """Small pykafka wrapper: call ``clien`` to bind a topic, then ``sendMessage``."""

    def clien(self):
        """Create a client for 127.0.0.1:9092 and store the 'test' topic on ``self.topic``.

        Also prints the client's topic collection.
        """
        kafka = KafkaClient(hosts="127.0.0.1:9092")
        self.topic = kafka.topics['test'.encode()]
        print(kafka.topics)

    def sendMessage(self, mydict):
        """JSON-encode ``mydict`` and produce it, as UTF-8 bytes, to ``self.topic``.

        ``self.topic`` is only assigned by ``clien``, so that must be called first.
        """
        with self.topic.get_sync_producer(delivery_reports=True) as sync_producer:
            encoded = json.dumps(mydict).encode('utf-8')
            sync_producer.produce(encoded)
# Demo run: bind the topic first (sendMessage reads self.topic, which only
# clien() assigns), then publish a sample payload.
py = pythonSendkafka()
py.clien()
df = [('a111', 'a2', 'a3'), ('b111', 'b2', 'b3'), ('c111', 'c2', 'c3')]
py.sendMessage(df)

  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值