Convert csv file to pipe delimited file in Python
This does what I think you want:
import csv

# Convert a comma-delimited CSV into a pipe-delimited file.
# The csv module must be given *text* streams in Python 3; the original
# 'rb'/'wb' binary modes raise "iterator should return strings, not bytes".
# newline='' is the documented way to open csv files so the csv module
# controls line endings itself (prevents blank rows on Windows).
with open('C:/Path/InputFile.csv', 'r', newline='') as fin, \
        open('C:/Path/OutputFile.txt', 'w', newline='') as fout:
    reader = csv.DictReader(fin)
    # Reuse the input's header order; only the delimiter changes.
    writer = csv.DictWriter(fout, reader.fieldnames, delimiter='|')
    writer.writeheader()
    writer.writerows(reader)
Convert csv file which is pipe delimited to JSON format
Pandas alone won't help you here. (Happily, you also don't need Pandas here.)
import csv
import io
import ast
import json

# Inline sample data; since StringIO is file-like, the result of an
# open() call can be substituted here unchanged.
data = io.StringIO(
    """
SetrecordID|SetMatchScore|Pairs
100,101,102|90|"100-101,40","100-102,80","101-102,90"
103,104,105|80|"103-104,60","103-105,90","104-105,90"
106,107,108|65|"106-107,55","106-108,60","107-108,80"
109,110,111|95|"109-110,85","109-111,100","110-111,100"
""".strip()
)


def _expand(record):
    """Unpack the comma-packed fields of one raw csv row, in place."""
    # QUOTE_NONE leaves the Pairs cell as a quoted tuple literal, e.g.
    # '"100-101,40","100-102,80",...' -- literal_eval yields its strings.
    raw_pairs = ast.literal_eval(record["Pairs"])
    expanded = []
    for idx, item in enumerate(raw_pairs, start=1):
        name, score = item.split(",", 1)
        expanded.append({f"Pair{idx}": name, "matchscore": int(score)})
    record["Pairs"] = expanded
    record["SetrecordID"] = record["SetrecordID"].split(",")
    return record


reader = csv.DictReader(data, delimiter="|", quoting=csv.QUOTE_NONE)
rows = [_expand(raw) for raw in reader]

# Persist the reshaped records as pretty-printed JSON.
with open("data.json", "w") as outf:
    json.dump(rows, outf, indent=2)
This will ingest that data into dicts you can work with (or just output to a JSON file):
{'SetrecordID': ['100', '101', '102'], 'SetMatchScore': '90', 'Pairs': [{'Pair1': '100-101', 'matchscore': 40}, {'Pair2': '100-102', 'matchscore': 80}, {'Pair3': '101-102', 'matchscore': 90}]}
{'SetrecordID': ['103', '104', '105'], 'SetMatchScore': '80', 'Pairs': [{'Pair1': '103-104', 'matchscore': 60}, {'Pair2': '103-105', 'matchscore': 90}, {'Pair3': '104-105', 'matchscore': 90}]}
{'SetrecordID': ['106', '107', '108'], 'SetMatchScore': '65', 'Pairs': [{'Pair1': '106-107', 'matchscore': 55}, {'Pair2': '106-108', 'matchscore': 60}, {'Pair3': '107-108', 'matchscore': 80}]}
{'SetrecordID': ['109', '110', '111'], 'SetMatchScore': '95', 'Pairs': [{'Pair1': '109-110', 'matchscore': 85}, {'Pair2': '109-111', 'matchscore': 100}, {'Pair3': '110-111', 'matchscore': 100}]}
Convert a folder of pipe-delimited text files to CSV in Python
Change the open statement to:
with open(file, "r", encoding="utf-8") as f:
This will open the file in text mode, as opposed to binary mode, and the explicit UTF-8 encoding allows you to read non-ASCII content.
How to convert pipe delimited to CSV or JSON
Finally found the solution.
I tested it on a 5 GB file; although slow, it still works. It imports all data from a pipe-delimited txt file into MongoDB.
import csv
import json

from pymongo import MongoClient

# Connection / target collection.
url_mongo = "mongodb://localhost:27017"
client = MongoClient(url_mongo)
db = client.Office
customer = db.Customer

file_txt = "Text.txt"
rowcount = 0

# Stream the pipe-delimited file and insert row by row instead of first
# accumulating every row in memory (the original buffered the whole file --
# reportedly 5 GB -- into a list, then pushed each dict through a
# json.dumps/json.loads round trip, which is a no-op copy for these
# string-keyed, string-valued rows).
with open(file_txt, "r") as txt_file:
    csv_reader = csv.DictReader(
        txt_file, dialect="excel", delimiter="|", quoting=csv.QUOTE_NONE
    )
    for row in csv_reader:
        rowcount += 1
        # Insert a copy so insert_one's added "_id" never mutates `row`.
        customer.insert_one(dict(row))

print("Finished")
Thank You All for your Ideas
how to convert pipe delimited text file to csv file in pyspark?
input=sc.textFile("hdfs://host/user/data/file1.txt") ## it is RDD
You can use this to convert rdd to DataFrame. We have "sc" as SparkContext.
# toDF() is shown bare first for reference, then applied to the RDD.
# NOTE(review): it is only attached to RDDs once a SQLContext/SparkSession
# exists -- confirm one has been created before calling it.
toDF()
input.toDF()
Otherwise you would have to infer the schema yourself and map or split each line on '|'. Instead, you can either import HiveContext or use sqlContext defined as a SQLContext. Using sqlContext, Spark will create a DataFrame for the file you specify.
# import
from pyspark.sql import HiveContext
# HiveContext wraps the existing SparkContext ("sc") to provide SQL reads.
sqlContext = HiveContext(sc)
# Then, you can use the com.databricks.spark.csv.
# Read the pipe-delimited file as a DataFrame: the header row supplies the
# column names and '|' is the field delimiter (spark-csv external package).
df = sqlContext.read \
.format('com.databricks.spark.csv') \
.options(header='true', delimiter='|') \
.load('hdfs://host/user/data/file1.txt')
Second approach (after you convert to .csv), you can use:
# Same reader, but with ',' as the delimiter for the converted .csv file.
# (The inline comment was moved up here: in Python nothing may follow the
# line-continuation backslash, so "...delimiter=',') \ # comment" was a
# SyntaxError in the original.)
df = sqlContext.read \
    .format('com.databricks.spark.csv') \
    .options(header='true', delimiter=',') \
    .load('hdfs://host/user/data/file1.csv')
- format(): Specify the format you want to read the file.
- options(): Allow you to specify header and you can specify delimiter.
- load(): Loads the file for given path.
For more information, refer:
- com.databricks.spark.csv: https://github.com/databricks/spark-csv
Related Topics
Adding Months to a Pandas Object in Python
Filtering Dataframe Using the Length of a Column
How to Split by Commas That Are Not Within Parentheses
Package Only Binary Compiled .So Files of a Python Library Compiled With Cython
How to Use Ffmpeg in a Python Function
Most Efficient Way to Forward-Fill Nan Values in Numpy Array
Calculate Final Letter Grade in Python Given 4 Test Scores
Best Practices for Adding .Gitignore File for Python Projects
Get the Last Sunday and Saturday's Date in Python
Finding the Two Closest Numbers in a List Using Sorting
How to Set the Default Python Path for Anaconda on Linux
Replace Single Quote to Double Quote Python Pandas Dataframe
Key Error: None of [Int64Index...] Dtype='Int64] Are in the Columns
Django Model Choice Option as a Multi Select Box
Inserting a Python Datetime.Datetime Object into MySQL
Split List into Lists Based on a Character Occurring Inside of an Element
I Need to Code a 1 22 333 4444 Pattern in Python With While Loops