Problem One
```
----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 48246)
Traceback (most recent call last):
  File "/usr/lib/python2.7/SocketServer.py", line 295, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python2.7/SocketServer.py", line 321, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python2.7/SocketServer.py", line 334, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python2.7/SocketServer.py", line 649, in __init__
    self.handle()
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/pyspark/serializers.py", line 545, in read_int
    raise EOFError
EOFError
----------------------------------------
py4j.java_gateway: ERROR Error while sending or receiving.
Traceback (most recent call last):
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 479, in send_command
    raise Py4JError("Answer from Java side is empty")
Py4JError: Answer from Java side is empty
py4j.java_gateway: ERROR Error while sending or receiving.
Traceback (most recent call last):
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 479, in send_command
    raise Py4JError("Answer from Java side is empty")
Py4JError: Answer from Java side is empty
py4j.java_gateway: ERROR An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 425, in start
    self.socket.connect((self.address, self.port))
  File "/usr/lib/python2.7/socket.py", line 224, in meth
    return getattr(self._sock, name)(*args)
error: [Errno 111] Connection refused
py4j.java_gateway: ERROR An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 425, in start
    self.socket.connect((self.address, self.port))
  File "/usr/lib/python2.7/socket.py", line 224, in meth
    return getattr(self._sock, name)(*args)
error: [Errno 111] Connection refused
Traceback (most recent call last):
  File "/home/zhmi/Pycharm Project/Applications-of-Machine-Learning/sms_spam_classification/class_filter_and_import _data_to_database.py", line 150, in <module>
  File "/home/zhmi/Pycharm Project/Applications-of-Machine-Learning/sms_spam_classification/class_filter_and_import _data_to_database.py", line 87, in stop
    self.sc.stop()
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/pyspark/context.py", line 339, in stop
    self._jsc.stop()
```
I don't fully understand this problem, but when I reduced the amount of data in each RDD, from about 800,000 records per RDD down to about 100,000 records per RDD, things improved a lot. The error sometimes shows up once the program has processed around 530,000 records, so my guess is that my machine's hardware simply can't keep up with that much data...
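A minimal sketch of that workaround, assuming the input records are available as a plain Python list on the driver; the names `process_in_chunks` and `process_rdd` and the chunk size are illustrative placeholders, not the original program:

```python
# Sketch only: feed Spark ~100,000 records per RDD instead of one RDD holding
# ~800,000, as described above. `records` and `process_rdd` are assumed
# placeholders; the original job is not shown.
from pyspark import SparkContext

CHUNK_SIZE = 100000  # records per RDD

def process_in_chunks(sc, records, process_rdd):
    """Hand `records` (a Python list) to Spark in chunks of CHUNK_SIZE."""
    for start in range(0, len(records), CHUNK_SIZE):
        rdd = sc.parallelize(records[start:start + CHUNK_SIZE])
        process_rdd(rdd)  # whatever transformation/action the job needs

if __name__ == "__main__":
    sc = SparkContext(appName="chunked-processing")
    sample = ["record %d" % i for i in range(250000)]
    process_in_chunks(sc, sample, lambda rdd: rdd.count())
    sc.stop()
```

Smaller per-RDD batches keep the Python worker and the accumulator server from being overwhelmed on a single underpowered machine, which is consistent with the EOFError / "Connection refused" symptoms above.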
Problem Two
```
  File "/home/zhmi/spark/spark-1.5.1-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/zhmi/Pycharm Project/Applications-of-Machine-Learning/sms_spam_classification/class_filter_and_import _data_to_database.py", line 92, in <lambda>
    .map(lambda x: list(jieba.cut(x)))
  File "/usr/local/lib/python2.7/dist-packages/jieba/__init__.py", line 276, in cut
    sentence = strdecode(sentence)
  File "/usr/local/lib/python2.7/dist-packages/jieba/_compat.py", line 28, in strdecode
    sentence = sentence.decode('utf-8')
AttributeError: 'list' object has no attribute 'decode'
```
The fix is to change the code to list(jieba.cut(x[0])). jieba.cut(x) returns an iterator, and since I want to store the segmentation result in an RDD, I have to force it into a list with list(jieba.cut(...)). Also, the argument passed to jieba.cut() must be a string; in my program x is a list, so x[0] pulls out the string I actually want to segment.
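A minimal before/after illustration of that fix; the record layout here (a list whose first element is the text to segment) is an assumption based on the description above, since the original data is not shown:

```python
# -*- coding: utf-8 -*-
# Illustration of the jieba.cut fix; the sample records are assumptions.
import jieba
from pyspark import SparkContext

sc = SparkContext(appName="jieba-fix-demo")
rdd = sc.parallelize([[u"我爱自然语言处理", "spam"], [u"今天天气不错", "ham"]])

# Broken: x is a list, so jieba ends up calling list.decode('utf-8')
# tokens = rdd.map(lambda x: list(jieba.cut(x)))

# Fixed: pass the string field to jieba, then materialize its generator
tokens = rdd.map(lambda x: list(jieba.cut(x[0])))
print(tokens.take(2))
sc.stop()
```

The list(...) call matters because jieba.cut() returns a lazy generator, which cannot be serialized back into the RDD as-is; converting it to a list makes the segmentation result a concrete value.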