Create single row dataframe from list of list pyspark
I have data like this: data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
I want to create a PySpark DataFrame from it.
I already use dataframe = sqlContext.createDataFrame(data, ['features']), but I always get
+--------+---+
|features| _2|
+--------+---+
| 1.1|1.2|
| 1.3|1.4|
| 1.5|1.6|
+--------+---+
How can I get a result like the one below?
+----------+
|features |
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
python apache-spark pyspark spark-dataframe
asked Feb 12 '18 at 11:08
Yanfa Adi Putra
629
You can create a schema and provide it while creating the DataFrame.
– Shankar Koirala
Feb 12 '18 at 11:11
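For reference, a minimal sketch of the schema approach the comment suggests (the schema and variable names here are illustrative, not from the original thread): each inner list is still wrapped in a one-element tuple, and the column type is declared explicitly as an array of doubles.
from pyspark.sql.types import ArrayType, DoubleType, StructField, StructType
# declare 'features' as an array-of-double column up front
schema = StructType([StructField('features', ArrayType(DoubleType()), True)])
# each inner list still has to be wrapped so it maps to one column
dataframe = sqlContext.createDataFrame([(row,) for row in data], schema)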
3 Answers
I find it's useful to think of the argument to createDataFrame() as a list of tuples, where each entry in the list corresponds to a row in the DataFrame and each element of the tuple corresponds to a column. You can get your desired output by making each element in the list a tuple:
data = [([1.1, 1.2],), ([1.3, 1.4],), ([1.5, 1.6],)]
dataframe = sqlCtx.createDataFrame(data, ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
Or if changing the source is cumbersome, you can equivalently do:
data = [[1.1, 1.2], [1.3, 1.4], [1.5, 1.6]]
dataframe = sqlCtx.createDataFrame(map(lambda x: (x, ), data), ['features'])
dataframe.show()
#+----------+
#| features|
#+----------+
#|[1.1, 1.2]|
#|[1.3, 1.4]|
#|[1.5, 1.6]|
#+----------+
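One caveat worth adding (not from the original answer): in Python 3, map() returns a lazy iterator rather than a list. Recent Spark versions accept it, but if yours complains, an equivalent list comprehension is the safe form:
# equivalent to the map() call above, but builds the list eagerly
dataframe = sqlCtx.createDataFrame([(x,) for x in data], ['features'])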
edited Feb 12 '18 at 17:04
answered Feb 12 '18 at 16:19
pault
14.4k31947
You need a map function to wrap each inner list so it maps to a single array column, and use that RDD in createDataFrame:
dataframe = sqlContext.createDataFrame(sc.parallelize(data).map(lambda x: [x]), ['features'])
You should get the result you want:
+----------+
| features|
+----------+
|[1.1, 1.2]|
|[1.3, 1.4]|
|[1.5, 1.6]|
+----------+
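To confirm the column really is an array rather than two flattened columns, printSchema() is handy; with the float data above, Spark should infer something like:
dataframe.printSchema()
# root
#  |-- features: array (nullable = true)
#  |    |-- element: double (containsNull = true)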
answered Feb 12 '18 at 11:23
Ramesh Maharjan
27k52047
You should use VectorAssembler. From your code I guess you are doing this to train a machine learning model, and VectorAssembler works best for that case. You can also add the assembler to a pipeline.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
# 'data' must already be a DataFrame here, not a list of lists
assemble_feature = VectorAssembler(inputCols=data.columns, outputCol='features')
pipeline = Pipeline(stages=[assemble_feature])
pipeline.fit(data).transform(data)
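For context, a minimal end-to-end sketch of this approach (the column names x1/x2 are illustrative, not from the original answer): start from the two-column DataFrame the question accidentally produced, then assemble it into a vector column. Note that VectorAssembler yields an ML Vector, not a plain array, which is what most pyspark.ml estimators expect.
from pyspark.ml.feature import VectorAssembler

# the two-column shape the question's createDataFrame call produced
df = sqlContext.createDataFrame([(1.1, 1.2), (1.3, 1.4), (1.5, 1.6)], ['x1', 'x2'])
assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
assembler.transform(df).select('features').show()
# +---------+
# | features|
# +---------+
# |[1.1,1.2]|
# |[1.3,1.4]|
# |[1.5,1.6]|
# +---------+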
answered Feb 12 '18 at 12:04
pratiklodha
686718