How to Get All the Contiguous Substrings of a String in Python

How To Get All The Contiguous Substrings Of A String In Python?

The only improvement I could think of is, to use list comprehension like this

def get_all_substrings(input_string):
  length = len(input_string)
  return [input_string[i:j+1] for i in xrange(length) for j in xrange(i,length)]

print get_all_substrings('abcde')

The timing comparison between, yours and mine

def get_all_substrings(string):
  length = len(string)
  alist = []
  for i in xrange(length):
    for j in xrange(i,length):
      alist.append(string[i:j + 1]) 
  return alist

def get_all_substrings_1(input_string):
  length = len(input_string)
  return [input_string[i:j + 1] for i in xrange(length) for j in xrange(i,length)]

from timeit import timeit
print timeit("get_all_substrings('abcde')", "from __main__ import get_all_substrings")
# 3.33308315277
print timeit("get_all_substrings_1('abcde')", "from __main__ import get_all_substrings_1")
# 2.67816185951

How to find contiguous substrings from a string in python

You can use itertools.groupby() for this:

from itertools import groupby

s = 'abccdddcce'
l1 = ["".join(g) for k, g in groupby(s)]
l2 = [a[:i+1] for a in l1 for i in range(len(a))]
print l2

Output:

['a', 'b', 'c', 'cc', 'd', 'dd', 'ddd', 'c', 'cc', 'e']

For larger input data, replace the Lists with Generators,

l1=()
l2=()

Is there a better way to find all the contiguous substrings of a string that appear in a given dictionary

Use Aho-Corasick or Rabin-Karp algorithms intended for this purpose:

It is a kind of dictionary-matching algorithm that locates elements of
a finite set of strings (the "dictionary") within an input text. It
matches all strings simultaneously

There are numerous Python implementations for these algorithms.

Complexity for Aho-Corasick searching is O(TextLength + AnswerLength), preprocessing O(n*σ), where n is overall length of all words in the dictionary, σ is alphabet size

For Rabin-Karp average time is O(TextLength + AnswerLength) too, but the worst time is O(TextLength * AnswerLength)

How to get every possible list of substrings from a string in Python?

I believe this is not strictly a duplicate, and I provide an exact solution to your problem.

Solution

For a given string of length n, we shall obtain a unique and valid partition of the string for every binary string of length (n-1). For example: The string "coconut" and the binary string "001010" corresponds to the partition: ['coc', 'on', 'ut'] and the binary string "100101" corresponds to: ['c','oco','nu','t'].

Thus we can get the full list of partitions as you require, by taking all ((2^(n-1))-1) different partitions corresponding to the different binary sequences.

Implementation

import itertools
def get_list_partitions(string):
  partitions = []
  
  #list of binary sequences
  binary_sequences = ["".join(seq) for seq in itertools.product("01", repeat=len(string)-1)]

  #go over every binary sequence (which represents a partition)
  for sequence in binary_sequences:
    partition = []
    
    #current substring, accumulates letters until it encounters "1" in the binary sequence
    curr_substring = string[0]
    for i, bit in enumerate(sequence):
      #if 0, don't partition. otherwise, add curr_substring to the current partition and set curr_substring to be the next letter
      if bit == '0':                        
        curr_substring = curr_substring + string[i+1]
      else:
        partition.append(curr_substring)
        curr_substring = string[i+1]  
    
    #add the last substring to the partition
    partition.append(curr_substring)

    #add partition to the list of partitions
    partitions.append(partition)  

  return partitions 

  
print(get_list_partitions("coconut"))

Find all common contiguous substrings of two strings in python

import string
s1 = 'Today is a good day, it is a good idea to have a walk.'
s2 = 'Yesterday was not a good day, but today is good, shall we have a walk?'
z=[]
s1=s1.translate(None, string.punctuation) #remove punctuation
s2=s2.translate(None, string.punctuation)
print s1
print s2
sw1=s1.lower().split()                   #split it into words
sw2=s2.lower().split()
print sw1,sw2
i=0
while i<len(sw1):          #two loops to detect common strings. used while so as to change value of i in the loop itself
    x=0
    r=""
    d=i
    #print r
    for j in range(len(sw2)):
        #print r
        if sw1[i]==sw2[j]:
            r=r+' '+sw2[j]                       #if string same keep adding to a variable
            x+=1
            i+=1
        else:
            if x>0:     # if not same check if there is already one in buffer and add it to result (here z)
                z.append(r)
                i=d
                r=""
                x=0
    if x>0:                                            #end case of above loop
        z.append(r)
        r=""
        i=d
        x=0
    i+=1 
    #print i
print list(set(z)) 

#O(n^3)

How to get all substrings & all character permutations of all substings?

import itertools

You can do, using itertools permutations & combinations:

def string_set(x):
    arr = [''.join(l) for i in range(len(x)) for l in itertools.combinations(x, i+1)]     
    reslist=[]   
    for eachpermut in arr:
        for each in [''.join(eachpermut) for eachpermut in list(itertools.permutations(eachpermut))]:
            reslist.append(each)
    return reslist

If you like everything in one line:

def string_set(x):
    return [each for eachpermut in [''.join(l) for i in range(len(x)) for l in itertools.combinations(x, i+1)] for each in [''.join(eachpermut) for eachpermut in list(itertools.permutations(eachpermut))]]

string_set('abc') returns:

['a',
 'b',
 'c',
 'ab',
 'ba',
 'ac',
 'ca',
 'bc',
 'cb',
 'abc',
 'acb',
 'bac',
 'bca',
 'cab',
 'cba']

If you don't want to import itertools, you can just use their permutations functions only:

def permutations(iterable, r=None):
    # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
    # permutations(range(3)) --> 012 021 102 120 201 210
    pool = tuple(iterable)
    n = len(pool)
    r = n if r is None else r
    if r > n:
        return
    indices = list(range(n))
    cycles = list(range(n, n-r, -1))
    yield tuple(pool[i] for i in indices[:r])
    while n:
        for i in reversed(range(r)):
            cycles[i] -= 1
            if cycles[i] == 0:
                indices[i:] = indices[i+1:] + indices[i:i+1]
                cycles[i] = n - i
            else:
                j = cycles[i]
                indices[i], indices[-j] = indices[-j], indices[i]
                yield tuple(pool[i] for i in indices[:r])
                break
        else:
            return

Then just do permutations instead of itertools.permutations in the above code.

Can do the same with combinations.

Getting all combinations of a string and its substrings

You can do this easily using itertools.combinations

>>> from itertools import combinations
>>> x = 'abc'
>>> [''.join(l) for i in range(len(x)) for l in combinations(x, i+1)]
['a', 'b', 'c', 'ab', 'ac', 'bc', 'abc']

If you want it in the reversed order, you can make the range function return its sequence in reversed order

>>> [''.join(l) for i in range(len(x),0,-1) for l in combinations(x, i)]
['abc', 'ab', 'ac', 'bc', 'a', 'b', 'c']

How to Get All the Contiguous Substrings of a String in Python