SwiftConfigurationException: Invalid host name when accessing Swift from Bluemix Spark

I have stored a file in Swift (Bluemix Object Storage) and am trying to access it from a Python application running on Spark in Bluemix.

I can successfully access the file from a Python notebook running in the same environment, but the access fails when it is attempted from a submitted Python application, with an org.apache.hadoop.fs.swift.exceptions.SwiftConfigurationException: Invalid host name error.

Here is the Python code:

# Added to the Python job but not part of the notebook 

from pyspark import SparkContext, SparkConf 
from pyspark.sql import SQLContext 
sc = SparkContext() 
sqlContext = SQLContext(sc) 

# Verified in the notebook 

from pyspark.sql import SQLContext 
from pyspark.sql.types import * 
from pyspark.sql import Row 
from pyspark.mllib.regression import LabeledPoint 
from pyspark.sql.functions import udf 
from pyspark.mllib.linalg import Vectors 
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.param import Param, Params 
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel 
from pyspark.mllib.stat import Statistics 
from pyspark.ml.feature import OneHotEncoder, StringIndexer 
from pyspark.ml.feature import VectorAssembler 
import sys 
import numpy as np 
import pandas as pd 
import time 
import datetime 

def set_hadoop_config(credentials): 
    """This function sets the Hadoop configuration with given credentials, 
    so it is possible to access data using SparkContext""" 

    prefix = "fs.swift.service." + credentials['name'] 
    hconf = sc._jsc.hadoopConfiguration() 
    hconf.set(prefix + ".auth.url", credentials['auth_url']+'/v3/auth/tokens') 
    hconf.set(prefix + ".auth.endpoint.prefix", "endpoints") 
    hconf.set(prefix + ".tenant", credentials['project_id']) 
    hconf.set(prefix + ".username", credentials['user_id']) 
    hconf.set(prefix + ".password", credentials['password']) 
    hconf.setInt(prefix + ".http.port", 8080) 
    hconf.set(prefix + ".region", credentials['region']) 
    hconf.setBoolean(prefix + ".public", True) 

credentials = { 
    'auth_url':'https://identity.open.softlayer.com', 
    'project':'object_storage_bcc6ba38_7399_4aed_a47c_e6bcdc959163', 
    'project_id':'f26ba12177c44e59adbe243b430b3bf5', 
    'region':'dallas', 
    'user_id':'bb973e5a84da4fce8c62d95f2e1e5d19', 
    'domain_id':'bd9453b2e5e2424388e25677cd26a7cf', 
    'domain_name':'1062145', 
    'username':'admin_a16bbb9d8d1d051ba505b6e7e76867f61c9d1ac1', 
    'password':"""...""", 
    'filename':'2001-2008-merged.csv', 
    'container':'notebooks', 
    'tenantId':'s090-be5845bf9646f1-3ef81b4dcb61' 
} 
credentials['name'] = 'FlightDelay_demo2' 
set_hadoop_config(credentials) 
textFile = sc.textFile("swift://" + credentials['container'] + "." + credentials['name'] + credentials['filename']) 

textFileRDD=textFile.map(lambda x: x.split(',')) 

# Line that throws the error: 
header = textFileRDD.first() 
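
For reference, the values written by set_hadoop_config can be read back for debugging (hconf.get is the read counterpart of the hconf.set calls above), for example:

prefix = "fs.swift.service." + credentials['name']
hconf = sc._jsc.hadoopConfiguration()
print(hconf.get(prefix + ".auth.url"))
print(hconf.get(prefix + ".region"))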

I submit the Python application to Spark as follows:

./spark-submit.sh \ 
    --vcap ./vcap.json \ 
    --deploy-mode cluster \ 
    --master https://spark.bluemix.net \ 
    /resources/FlightDelay_demo2.py 

Here is my vcap.json:

{
    "credentials": {
        "tenant_id": "s090-be5845bf9646f1-3ef81b4dcb61",
        "tenant_id_full": "2f61d7ef-955a-41f2-9090-be5845bf9646_dd570054-525b-4659-9af1-3ef81b4dcb61",
        "cluster_master_url": "https://spark.bluemix.net",
        "instance_id": "2f61d7ef-955a-41f2-9090-be5845bf9646",
        "tenant_secret": "...",
        "plan": "ibm.SparkService.PayGoPersonal"
    }
}

Here is the full error:

Traceback (most recent call last): 
    File "/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/workdir/spark-driver-b2f25278-f8b4-40e3-8d53-9e8a64228197/FlightDelay_demo2.py", line 94, in <module> 
    header = textFileRDD.first() 
    File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1315, in first 
    File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1267, in take 
    File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/rdd.py", line 2363, in getNumPartitions 
    File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__ 
    File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/sql/utils.py", line 45, in deco 
    File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value 
py4j.protocol.Py4JJavaError: An error occurred while calling o33.partitions. 
: org.apache.hadoop.fs.swift.exceptions.SwiftConfigurationException: Invalid host name 
    at org.apache.hadoop.fs.swift.util.SwiftUtils.validSchema(SwiftUtils.java:222) 
    at org.apache.hadoop.fs.swift.http.SwiftRestClient.<init>(SwiftRestClient.java:510) 
    at org.apache.hadoop.fs.swift.http.SwiftRestClient.getInstance(SwiftRestClient.java:1914) 
    at org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystemStore.initialize(SwiftNativeFileSystemStore.java:81) 
    at org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem.initialize(SwiftNativeFileSystem.java:129) 
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2596) 
    at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91) 
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630) 
    at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612) 
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370) 
    at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296) 
    at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:256) 
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228) 
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313) 
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199) 
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) 
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) 
    at scala.Option.getOrElse(Option.scala:120) 
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) 
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) 
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239) 
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237) 
    at scala.Option.getOrElse(Option.scala:120) 
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:237) 
    at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:64) 
    at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:46) 
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:95) 
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:55) 
    at java.lang.reflect.Method.invoke(Method.java:507) 
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) 
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381) 
    at py4j.Gateway.invoke(Gateway.java:259) 
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) 
    at py4j.commands.CallCommand.execute(CallCommand.java:79) 
    at py4j.GatewayConnection.run(GatewayConnection.java:209) 
    at java.lang.Thread.run(Thread.java:785) 

I thought it might be relevant, so here is what the Spark configuration (sorted(sc._conf.getAll())) looks like in the Python application:

[(u'spark.app.name', u'/resources/FlightDelay_demo2.py'), 
(u'spark.driver.extraClassPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'), 
(u'spark.driver.extraLibraryPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'), 
(u'spark.eventLog.dir', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/events'), 
(u'spark.eventLog.enabled', u'true'), 
(u'spark.executor.extraClassPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'), 
(u'spark.executor.extraLibraryPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'), 
(u'spark.executor.instances', u'2'), 
(u'spark.executor.memory', u'1024m'), 
(u'spark.files', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/128249272b758216b946308d5f6ea43ca033a85a/FlightDelay_demo2.py'), 
(u'spark.files.useFetchCache', u'false'), 
(u'spark.master', u'spark://yp-spark-dal09-env5-0020:7083'), 
(u'spark.rdd.compress', u'True'), 
(u'spark.serializer.objectStreamReset', u'100'), 
(u'spark.service.hashed_tenant_id', u'25vT74pNDbLRr98mnGe81k9FmiS9NRiyELS04g=='), 
(u'spark.service.plan_name', u'ibm.SparkService.PayGoPersonal'), 
(u'spark.service.spark_version', u'1.6.0'), 
(u'spark.shuffle.service.port', u'7340'), 
(u'spark.submit.pyFiles', u'null'), 
(u'spark.ui.port', u'0')] 

Answer

There are two problems, both in how the swift:// URL is constructed (a combined sketch applying both fixes follows the list):

  1. credentials['name'] must not contain characters that are invalid in a host name, such as underscores. Remove the underscore like this:

    credentials['name'] = 'FlightDelayDemo2' 
    
  2. A slash is missing between the host name and the file name. It needs to be added:

    textFile = sc.textFile("swift://" + credentials['container'] + "." + credentials['name'] + "/" + credentials['filename']) 
    

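Putting both fixes together, a minimal sketch reusing the credentials dict and set_hadoop_config from the question:

    credentials['name'] = 'FlightDelayDemo2'  # no underscore: this becomes part of a host name
    set_hadoop_config(credentials)

    # The authority is "<container>.<service name>"; note the "/" before the file name.
    swift_url = "swift://{0}.{1}/{2}".format(
        credentials['container'], credentials['name'], credentials['filename'])
    textFile = sc.textFile(swift_url)
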
Since I am answering my own question, I would like to mention another cryptic error that you may see while debugging something similar:

pyspark.sql.utils.IllegalArgumentException: u'Host name may not be null'
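
As far as I can tell, this one has the same root cause: Java's URI parser returns a null host when the authority contains characters that are illegal in a host name (such as underscores), so it pays to validate the URL before handing it to Spark. A minimal sketch (check_swift_url is just an illustration, not part of any API; swift_url is whatever you would pass to sc.textFile):

    import re

    def check_swift_url(swift_url):
        """Catch the two URL problems described above before Spark does."""
        m = re.match(r'^swift://([^/]+)(/.+)?$', swift_url)
        if m is None:
            raise ValueError("not a swift:// URL: " + swift_url)
        authority, path = m.groups()
        if not re.match(r'^[A-Za-z0-9.-]+$', authority):
            raise ValueError("illegal characters in host name: " + authority)
        if path is None:
            raise ValueError("missing '/' between host name and file name")

    check_swift_url("swift://notebooks.FlightDelayDemo2/2001-2008-merged.csv")  # passes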