How to Get Line Count of a Large File Cheaply in Python

How to get line count of a large file cheaply in Python?

You can't get much better than what you already have.

After all, any solution will have to read the entire file, count how many \n characters it contains, and return that result.

Do you have a better way of doing that without reading the entire file? Not sure... The best solution will always be I/O-bound; the best you can do is make sure you don't use unnecessary memory, and it looks like you have that covered.
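For reference, a minimal sketch of such a memory-light counter, assuming a plain text file (the chunk size here is an arbitrary choice, not from the original answer):

def count_lines(path, bufsize=1 << 20):
    # Read 1 MiB binary chunks; memory use stays constant regardless of file size.
    # Counts b"\n" bytes, so a final line without a trailing newline is not counted.
    with open(path, "rb") as f:
        return sum(chunk.count(b"\n") for chunk in iter(lambda: f.read(bufsize), b""))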

(Python) Counting lines in a huge (>10GB) file as fast as possible

Ignacio's answer is correct, but it might fail if you have a 32-bit process.

Instead, it can be useful to read the file block-wise and count the \n characters in each block:

def blocks(files, size=65536):
    while True:
        b = files.read(size)
        if not b:
            break
        yield b

with open("file", "r") as f:
    print sum(bl.count("\n") for bl in blocks(f))  # Python 2 print statement

This will do the job.

Note that I don't open the file in binary mode, so each \r\n is converted to \n, making the counting more reliable.

For Python 3, and to make it more robust when reading files with all kinds of characters:

def blocks(files, size=65536):
    while True:
        b = files.read(size)
        if not b:
            break
        yield b

with open("file", "r", encoding="utf-8", errors='ignore') as f:
    print(sum(bl.count("\n") for bl in blocks(f)))
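If decoding overhead matters, the same blocks() generator also works in binary mode; a sketch (each \r\n contains exactly one \n, so the count is unchanged):

with open("file", "rb") as f:
    # Count raw newline bytes without decoding the file.
    print(sum(bl.count(b"\n") for bl in blocks(f)))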

How to get a line count of a file in Python

import sys
file = sys.argv[1]

with open(file, "r") as f:
    num_lines = len(f.readlines())
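Note that readlines() builds a list of every line in memory at once. For a large file, a lazy variant keeps memory flat; a minimal sketch:

with open(file, "r") as f:
    num_lines = sum(1 for _ in f)  # iterate lazily instead of materializing a list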

Count lines in very large file where line size is fixed

Since each row has a fixed number of characters, just get the file's size in bytes with os.path.getsize, subtract the length of the header, then divide by the length of each row. Something like this:

import os

file_name = 'TickStory/EURUSD.csv'

len_head = len('Timestamp,Bid price\n')
len_row = len('2012-01-01 22:00:36.416,1.29368\n')

size = os.path.getsize(file_name)

print((size - len_head) // len_row + 1)  # integer division; + 1 counts the header line

This assumes all characters in the file are 1 byte.
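If hard-coding the header and row templates feels brittle, both lengths can be read from the file itself; a sketch under the same fixed-width assumption (binary mode so the lengths are in bytes):

import os

file_name = 'TickStory/EURUSD.csv'

with open(file_name, 'rb') as f:
    len_head = len(f.readline())  # header, including its newline
    len_row = len(f.readline())   # first data row; every row has this length

size = os.path.getsize(file_name)
print((size - len_head) // len_row + 1)  # + 1 counts the header line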

Is there an efficient way to get line N of a large file in Python?

How does islice() perform on your file?

from itertools import islice

def read_n_line(filename, line_num, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        return next(islice(f, line_num - 1, line_num))
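One caveat: next() raises StopIteration when the file has fewer than line_num lines. Passing a default avoids that; a sketch:

def read_n_line_safe(filename, line_num, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        # Return None instead of raising when the file is too short.
        return next(islice(f, line_num - 1, line_num), None)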

Read nth line from file efficiently in Python

For sufficiently large files, it may be more efficient to use a Numba-based approach:

import numba as nb


@nb.njit
def process(
    block,
    n,
    last_nl_pos,
    nl_pos,
    nl_count,
    offset,
    nl=ord("\n")
):
    # Scan one block, updating the running newline positions and count.
    for i, c in enumerate(block, offset):
        if c == nl:
            last_nl_pos = nl_pos
            nl_pos = i
            nl_count += 1
            if nl_count == n:
                break
    return last_nl_pos, nl_pos, nl_count


def read_nth_line_nb(
    filepath: str,
    n: int,
    encoding="utf-8",
    size=2 ** 22,  # 4 MiB
) -> str:
    with open(filepath, "rb") as file_obj:
        last_nl_pos = nl_pos = -1
        nl_count = -1  # -1 so that n == 0 selects the first line
        offset = 0
        while True:
            block = file_obj.read(size)
            if block:
                last_nl_pos, nl_pos, nl_count = process(
                    block, n, last_nl_pos, nl_pos, nl_count, offset
                )
                offset += size
                if nl_count == n:
                    # Seek back to just past the previous newline and read the line.
                    file_obj.seek(last_nl_pos + 1)
                    return file_obj.read(nl_pos - last_nl_pos).decode(encoding)
            else:
                return  # fewer than n + 1 lines in the file

This essentially processes the file in blocks, keeping track of how many newlines have been seen and where they are (and how far into the file the current block starts).

For comparison I use the itertools.islice() approach:

import itertools


def read_nth_line_isl(filepath: str, n: int, encoding="utf-8") -> str:
    with open(filepath, "r", encoding=encoding) as file_obj:
        return next(itertools.islice(file_obj, n, n + 1), None)

as well as the naïve looping:

def read_nth_line_loop(filepath: str, n: int, encoding="utf-8") -> str:
    with open(filepath, "r", encoding=encoding) as file_obj:
        for i, line in enumerate(file_obj):
            if i == n:
                return line
        return None

Assuming some files were created with the following:

import random
import string


def write_some_file(filepath: str, n: int, m: int = 10, l: int = 100, encoding="utf-8") -> None:
    # Note: `k` (the zero-padding width) is the global defined below.
    with open(filepath, "w", encoding=encoding) as file_obj:
        for i in range(n):
            line = "".join(random.choices(string.ascii_letters, k=random.randrange(m, l)))
            file_obj.write(f"{i:0{k}d} - {line}\n")


k = 9
for i in range(1, k):
    n_max = 10 ** i
    print(n_max)
    write_some_file(f"test{n_max:0{k}d}.txt", n_max)

It is possible to test that they all give the same result:

funcs = read_nth_line_isl, read_nth_line_loop, read_nth_line_nb
k = 9
n_max = 10 ** 5
filename = f"test{n_max:0{k}d}.txt"
for func in funcs:
    print(f"{func.__name__:>20} {func(filename, n_max - 1)!r}")
# read_nth_line_isl '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
# read_nth_line_loop '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'
# read_nth_line_nb '000099999 - sWBnniKkpROZYlqfFLbSttEwYCjXfhQSapkkqxjePpGbobJzgaJTCOCSyHQEcLppZ\n'

The timings can be computed with:

# Run in IPython/Jupyter (uses the %timeit magic).
k = 9
timings = {}
for i in range(1, k - 1):
    n_max = 10 ** i
    filename = f"test{n_max:0{k}d}.txt"
    print(filename)
    timings[i] = []
    base = funcs[0](filename, n_max - 1)
    for func in funcs:
        res = func(filename, n_max - 1)
        is_good = base == res
        if i < 6:
            timed = %timeit -r 12 -n 12 -q -o func(filename, n_max - 1)
        else:
            timed = %timeit -r 1 -n 1 -q -o func(filename, n_max - 1)
        timing = timed.best * 1e3
        timings[i].append(timing if is_good else None)
        print(f"{func.__name__:>24} {is_good!s:5} {timing:10.3f} ms")

and plotted with:

import pandas as pd
import matplotlib.pyplot as plt


df = pd.DataFrame(data=timings, index=[func.__name__ for func in funcs]).transpose()
df.plot(marker='o', xlabel='log₁₀(Input Size) / #', ylabel='Best timing / ms', figsize=(6, 4), logy=True)
fig = plt.gcf()
fig.patch.set_facecolor('white')

to obtain:

[benchmark plot: best timing vs. input size for the three implementations]

This indicates that the Numba-based approach is marginally faster (some 5-15%) for sufficiently large inputs (above 10⁵ lines).

How to read a text file in Python and then write something to the same file

This code may help you.

with open('result.txt', 'r') as file:
    data = file.readlines()  # returns a list of all the lines in the text file

for a in range(len(data)):  # loop through all the lines
    if 'person' in data[a] and 'tv' in data[a]:  # check if person and tv are on the same line
        data[a] = data[a].replace('\n', '') + ' status : High\n'
    elif 'person' in data[a] and 'laptop' in data[a]:  # check if person and laptop are on the same line
        data[a] = data[a].replace('\n', '') + ' status : Low\n'

with open('result.txt', 'w') as file:
    file.writelines(data)  # write the lines back to the file
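One caution: because this rewrites result.txt in place, a crash mid-write can lose the original contents. A more careful variant, as a sketch, writes to a temporary file in the same directory and swaps it into place:

import os
import tempfile

with open('result.txt', 'r') as file:
    data = file.readlines()

# ... modify `data` as in the snippet above ...

# Write to a temporary file first, then atomically replace the original.
fd, tmp_path = tempfile.mkstemp(dir='.')
with os.fdopen(fd, 'w') as tmp:
    tmp.writelines(data)
os.replace(tmp_path, 'result.txt')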

