How to Convert a String Column to an Integer Column in a PySpark DataFrame

I'm looking to convert a string column to an integer column in PySpark. What happens to strings that can't be converted?

Values that cannot be cast are set to null, and the result is a nullable column of the target type. Here's a simple example:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([("1",),
                            ("2",),
                            ("3",),
                            ("4",),
                            ("hello world",)], schema=['id'])

df.show()

# cast() does the conversion; Column.astype() is an alias for it.
# Strings that cannot be parsed as integers become null.
df = df.withColumn("id", F.col("id").cast(IntegerType()))

df.show()

Output:

+-----------+
|         id|
+-----------+
|          1|
|          2|
|          3|
|          4|
|hello world|
+-----------+

+----+
|  id|
+----+
|   1|
|   2|
|   3|
|   4|
|null|
+----+

And to verify the schema is correct:

df.printSchema()

Output:

root
 |-- id: integer (nullable = true)
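
If you also need to know which rows failed the conversion, one option (a sketch reusing the session and imports above) is to cast into a new column and filter for rows where the original value was non-null but the cast came back null:

df2 = spark.createDataFrame([("1",), ("oops",)], schema=['id'])
# rows that were non-null before the cast but null after it failed to parse
df2 = df2.withColumn("id_int", F.col("id").cast(IntegerType()))
df2.filter(F.col("id").isNotNull() & F.col("id_int").isNull()).show()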

Hope this helps!

PySpark: convert a column containing strings into a list of strings and save it into the same column

Just use the split function:

import pyspark.sql.functions as f

# split() turns the string column into an array<string>, splitting on spaces
df = df.withColumn('Col2', f.split('Col2', ' '))
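
As a quick sanity check, here is a self-contained sketch (the sample data is made up):

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([("a b c",), ("hello world",)], ['Col2'])
df = df.withColumn('Col2', f.split('Col2', ' '))
df.show(truncate=False)   # [a, b, c] / [hello, world]
df.printSchema()          # Col2 is now array<string>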

Convert string type column to struct and unzip the column using PySpark

Here is my attempt using Spark built-in functions.

The idea is to first create two columns: one holding the customer/purchase labels and another holding the corresponding values. To get those columns I used split and then explode.

Once we have the customer/purchase labels, groupBy + pivot pivots the data, and a final split turns the columns into arrays.

Example:

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol']
)

from pyspark.sql.functions import *

(inputdf
    # split the raw string on ")," so each (value, label) pair becomes one element
    .withColumn("newcol", split(col("infoCol"), r"\),"))
    .selectExpr("explode(newcol)", "rowNum")
    # strip brackets/parentheses, then split each pair into its two parts
    .withColumn("newCol1", split(regexp_replace(col("col"), r"[\[|\]|\(|\)]", ""), "',"))
    .withColumn("new1", regexp_replace(trim(element_at(col("newCol1"), 1)), "[']", ""))
    .withColumn("new2", regexp_replace(trim(element_at(col("newCol1"), 2)), "[']", ""))
    # pivot on the label (customer/purchase), taking the first value per group
    .groupby("rowNum")
    .pivot("new2")
    .agg(first(col("new1")))
    # finally split the comma-separated values into arrays
    .withColumn("customer", split(col("customer"), ","))
    .withColumn("purchase", split(col("purchase"), ","))
    .show())

#+------+--------+-----------------+
#|rowNum|customer|         purchase|
#+------+--------+-----------------+
#|   200|   [doe]|             null|
#|   100|  [john]|[abc,  mno,  xyz]|
#+------+--------+-----------------+
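
To see what the intermediate split + explode stage produces before the pivot, you can run just the first two steps on the same inputdf (a sketch; output omitted):

(inputdf
    .withColumn("newcol", split(col("infoCol"), r"\),"))
    .selectExpr("explode(newcol)", "rowNum")
    .show(truncate=False))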

UPDATE (to keep customer and purchase as plain strings instead of splitting them into arrays):

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol']
)

from pyspark.sql.functions import *

(inputdf
    .withColumn("newcol", split(col("infoCol"), r"\),"))
    .selectExpr("explode(newcol)", "rowNum")
    .withColumn("newCol1", split(regexp_replace(col("col"), r"[\[|\]|\(|\)]", ""), "',"))
    .withColumn("new1", regexp_replace(trim(element_at(col("newCol1"), 1)), "[']", ""))
    .withColumn("new2", regexp_replace(trim(element_at(col("newCol1"), 2)), "[']", ""))
    .groupby("rowNum")
    .pivot("new2")
    .agg(first(col("new1")))
    # no final split, so customer/purchase remain plain strings
    .show())

#+------+--------+-------------+
#|rowNum|customer|     purchase|
#+------+--------+-------------+
#|   200|     doe|         null|
#|   100|    john|abc, mno, xyz|
#+------+--------+-------------+

UPDATE2 (to handle multiple purchase entries per row):

inputdf = spark.createDataFrame(
    [
        ("100", "[('john', 'customer'), ('abc, mno, xyz', 'purchase'), ('abc123', 'purchase')]"),
        ("200", "[('doe', 'customer')]"),
    ],
    ['rowNum', 'infoCol']
)

from pyspark.sql.functions import *


(inputdf
    # one (value, label) chunk per row after splitting on "),"
    .withColumn("newcol", split(col("infoCol"), r"\),"))
    .selectExpr("explode(newcol)", "rowNum")
    # strip brackets/quotes and split each chunk into its parts
    .withColumn("newCol1", expr("""transform(split(regexp_replace(col,"[\[|\]|\(|\)]",""),"',"),x -> regexp_replace(trim(x),"[']",""))"""))
    # the last element of each chunk is the label (customer/purchase)
    .withColumn("new1", regexp_replace(element_at(col("newCol1"), -1), r"[\]]", ""))
    # everything that is not a label is a value
    .withColumn("new2", array_except(col("newCol1"), array(lit('purchase'), lit('customer'), lit('purchase]'), lit('customer]'))))
    .withColumn("new2", expr("""transform(new2,x -> concat("'",regexp_replace(x,"[\\\\[]",""),"'"))"""))
    .drop(*['col', 'newCol1'])
    # collect all values per label per row, then pivot on the label
    .groupby("new1", "rowNum").agg(flatten(collect_list(col("new2"))).alias("new2"))
    .groupby("rowNum").pivot("new1").agg(first(col("new2")))
    .show(10, False))

#+------+--------+---------------------------+
#|rowNum|customer|purchase                   |
#+------+--------+---------------------------+
#|200   |['doe'] |null                       |
#|100   |['john']|['abc, mno, xyz', 'abc123']|
#+------+--------+---------------------------+

How to cast all columns of a DataFrame to string

Just:

from pyspark.sql.functions import col

table = spark.table("table")

# cast every column to string, keeping the original column names
table = table.select([col(c).cast("string").alias(c) for c in table.columns])
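
A quick self-contained check (a sketch; the sample DataFrame is made up):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(1, 2.5, "a")], ["x", "y", "z"])
df = df.select([col(c).cast("string").alias(c) for c in df.columns])
df.printSchema()
# root
#  |-- x: string (nullable = true)
#  |-- y: string (nullable = true)
#  |-- z: string (nullable = true)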

