Strange String.unicodeScalars and CharacterSet behaviour
CharacterSet.capitalizedLetters
returns a character set containing the characters in Unicode General Category Lt aka "Letter, titlecase". These are
"Ligatures containing uppercase followed by lowercase letters (e.g., Dž, Lj, Nj, and Dz)" (compare Wikipedia: Unicode character property or
Unicode® Standard Annex #44 – Table 12. General_Category Values).
You can find a list here: Unicode Characters in the 'Letter, Titlecase' Category.
You can also use the code from
NSArray from NSCharacterSet to dump the contents of the character
set:
extension CharacterSet {
    /// Lists every member of the set as a `Character`, in ascending code-point order.
    ///
    /// Planes 0 through 16 are visited; a plane is scanned value by value only
    /// when `hasMember(inPlane:)` reports that it has members.
    ///
    /// - Returns: One `Character` per Unicode scalar contained in the set.
    func allCharacters() -> [Character] {
        var members: [Character] = []
        for plane in UInt8(0)...16 {
            guard hasMember(inPlane: plane) else { continue }
            // Each plane spans 0x10000 consecutive values.
            let lowerBound = UInt32(plane) << 16
            let upperBound = lowerBound + 0x1_0000
            for codePoint in lowerBound..<upperBound {
                // The scalar initializer is failable, so values it rejects are skipped.
                guard let scalar = UnicodeScalar(codePoint), contains(scalar) else { continue }
                members.append(Character(scalar))
            }
        }
        return members
    }
}
// Dump every member of the titlecase-letter set (Unicode General Category Lt).
let characterSet = CharacterSet.capitalizedLetters
print(characterSet.allCharacters())
// ["Dž", "Lj", "Nj", "Dz", "ᾈ", "ᾉ", "ᾊ", "ᾋ", "ᾌ", "ᾍ", "ᾎ", "ᾏ", "ᾘ", "ᾙ", "ᾚ", "ᾛ", "ᾜ", "ᾝ", "ᾞ", "ᾟ", "ᾨ", "ᾩ", "ᾪ", "ᾫ", "ᾬ", "ᾭ", "ᾮ", "ᾯ", "ᾼ", "ῌ", "ῼ"]
What you probably want is CharacterSet.uppercaseLetters
which
Returns a character set containing the characters in Unicode General Category Lu and Lt.
Strange String.unicodeScalars and CharacterSet behaviour
CharacterSet.capitalizedLetters
returns a character set containing the characters in Unicode General Category Lt aka "Letter, titlecase". These are
"Ligatures containing uppercase followed by lowercase letters (e.g., Dž, Lj, Nj, and Dz)" (compare Wikipedia: Unicode character property or
Unicode® Standard Annex #44 – Table 12. General_Category Values).
You can find a list here: Unicode Characters in the 'Letter, Titlecase' Category.
You can also use the code from
NSArray from NSCharacterSet to dump the contents of the character
set:
extension CharacterSet {
    /// Lists every member of the set as a `Character`, in ascending code-point order.
    ///
    /// Planes 0 through 16 are visited; a plane is scanned value by value only
    /// when `hasMember(inPlane:)` reports that it has members.
    ///
    /// - Returns: One `Character` per Unicode scalar contained in the set.
    func allCharacters() -> [Character] {
        var members: [Character] = []
        for plane in UInt8(0)...16 {
            guard hasMember(inPlane: plane) else { continue }
            // Each plane spans 0x10000 consecutive values.
            let lowerBound = UInt32(plane) << 16
            let upperBound = lowerBound + 0x1_0000
            for codePoint in lowerBound..<upperBound {
                // The scalar initializer is failable, so values it rejects are skipped.
                guard let scalar = UnicodeScalar(codePoint), contains(scalar) else { continue }
                members.append(Character(scalar))
            }
        }
        return members
    }
}
// Dump every member of the titlecase-letter set (Unicode General Category Lt).
let characterSet = CharacterSet.capitalizedLetters
print(characterSet.allCharacters())
// ["Dž", "Lj", "Nj", "Dz", "ᾈ", "ᾉ", "ᾊ", "ᾋ", "ᾌ", "ᾍ", "ᾎ", "ᾏ", "ᾘ", "ᾙ", "ᾚ", "ᾛ", "ᾜ", "ᾝ", "ᾞ", "ᾟ", "ᾨ", "ᾩ", "ᾪ", "ᾫ", "ᾬ", "ᾭ", "ᾮ", "ᾯ", "ᾼ", "ῌ", "ῼ"]
What you probably want is CharacterSet.uppercaseLetters
which
Returns a character set containing the characters in Unicode General Category Lu and Lt.
Checking CharacterSet for single UnicodeScalar yields strange behaviour
The source code to CharacterSet
is available, actually. The source for contains
is:
/// Reports whether `member` belongs to this character set.
///
/// Both backing cases forward the scalar's numeric value (`member.value`) to
/// `CFCharacterSetIsLongCharacterMember`; no membership logic lives here.
fileprivate func contains(_ member: Unicode.Scalar) -> Bool {
switch _backing {
case .immutable(let cs):
return CFCharacterSetIsLongCharacterMember(cs, member.value)
case .mutable(let cs):
return CFCharacterSetIsLongCharacterMember(cs, member.value)
}
}
So it basically just calls through to CFCharacterSetIsLongCharacterMember
. The source code for that is also available, although only for Yosemite (the versions for El Cap and Sierra both say "Coming Soon"). However, the Yosemite code seemed to match what I was seeing in the disassembly on Sierra. Anyway, the code for that looks like this:
// Returns whether the 32-bit code point theChar is a member of theSet, taking the
// set's own inverted flag and its annex inverted flag into account.
Boolean CFCharacterSetIsLongCharacterMember(CFCharacterSetRef theSet, UTF32Char theChar) {
CFIndex length;
// Everything above the low 16 bits; zero when theChar is at most 0xFFFF.
UInt32 plane = (theChar >> 16);
Boolean isAnnexInverted = false;
Boolean isInverted;
Boolean result = false;
// Hands the call to -longCharacterIsMember: through the dispatch macro (macro definition not shown here).
CF_OBJC_FUNCDISPATCHV(__kCFCharacterSetTypeID, Boolean, (NSCharacterSet *)theSet, longCharacterIsMember:(UTF32Char)theChar);
__CFGenericValidateType(theSet, __kCFCharacterSetTypeID);
// Code points above 0xFFFF take this branch.
if (plane) {
CFCharacterSetRef annexPlane;
// Builtin sets are answered directly from the full code point.
if (__CFCSetIsBuiltin(theSet)) {
isInverted = __CFCSetIsInverted(theSet);
return (CFUniCharIsMemberOf(theChar, __CFCSetBuiltinType(theSet)) ? !isInverted : isInverted);
}
isAnnexInverted = __CFCSetAnnexIsInverted(theSet);
// Look up the per-plane annex set; NULL means there is none for this plane.
if ((annexPlane = __CFCSetGetAnnexPlaneCharacterSetNoAlloc(theSet, plane)) == NULL) {
// Only a range set without any non-BMP plane is still tested against theChar here.
if (!__CFCSetHasNonBMPPlane(theSet) && __CFCSetIsRange(theSet)) {
isInverted = __CFCSetIsInverted(theSet);
length = __CFCSetRangeLength(theSet);
return (length && __CFCSetRangeFirstChar(theSet) <= theChar && theChar < __CFCSetRangeFirstChar(theSet) + length ? !isInverted : isInverted);
} else {
// Every other set type without an annex plane: the answer is the annex inverted flag alone.
return (isAnnexInverted ? true : false);
}
} else {
// Continue below with the annex plane's set and the low 16 bits of the code point.
theSet = annexPlane;
theChar &= 0xFFFF;
}
}
isInverted = __CFCSetIsInverted(theSet);
// Membership test depends on how the set stores its contents.
switch (__CFCSetClassType(theSet)) {
case __kCFCharSetClassBuiltin:
result = (CFUniCharIsMemberOf(theChar, __CFCSetBuiltinType(theSet)) ? !isInverted : isInverted);
break;
case __kCFCharSetClassRange:
// Member when firstChar <= theChar < firstChar + length; a zero length never matches.
length = __CFCSetRangeLength(theSet);
result = (length && __CFCSetRangeFirstChar(theSet) <= theChar && theChar < __CFCSetRangeFirstChar(theSet) + length ? !isInverted : isInverted);
break;
case __kCFCharSetClassString:
// Searches the string buffer for theChar; an empty buffer yields isInverted.
result = ((length = __CFCSetStringLength(theSet)) ? (__CFCSetBsearchUniChar(__CFCSetStringBuffer(theSet), length, theChar) ? !isInverted : isInverted) : isInverted);
break;
case __kCFCharSetClassBitmap:
// NOTE(review): the guard reads __CFCSetCompactBitmapBits while the lookup reads __CFCSetBitmapBits,
// and a hit/miss is not flipped by isInverted — reproduced as quoted; confirm against upstream.
result = (__CFCSetCompactBitmapBits(theSet) ? (__CFCSetIsMemberBitmap(__CFCSetBitmapBits(theSet), theChar) ? true : false) : isInverted);
break;
case __kCFCharSetClassCompactBitmap:
result = (__CFCSetCompactBitmapBits(theSet) ? (__CFCSetIsMemberInCompactBitmap(__CFCSetCompactBitmapBits(theSet), theChar) ? true : false) : isInverted);
break;
default:
CFAssert1(0, __kCFLogAssertion, "%s: Internal inconsistency error: unknown character set type", __PRETTY_FUNCTION__); // We should never come here
return false; // To make compiler happy
}
// Flip the result when the annex is inverted (isAnnexInverted stays false for plane 0).
return (result ? !isAnnexInverted : isAnnexInverted);
}
So we can follow along, and figure out what's going on. Unfortunately we have to bust out our x86_64 assembly skills to do it. But fear not, for I have done this for you already, because apparently this is what I do for fun on a Friday night.
A helpful thing to have is the data structure:
// In-memory layout of a character set object, as quoted from the CoreFoundation source.
struct __CFCharacterSet {
CFRuntimeBase _base;
CFHashCode _hashValue;
// One storage variant per set class, overlaid in a union.
// NOTE(review): presumably the class-type bits decide which variant is live — confirm against the accessor macros.
union {
struct {
CFIndex _type;
} _builtin;
struct {
UInt32 _firstChar;
CFIndex _length;
} _range;
struct {
UniChar *_buffer;
CFIndex _length;
} _string;
struct {
uint8_t *_bits;
} _bitmap;
struct {
uint8_t *_cBits;
} _compactBitmap;
} _variants;
// NOTE(review): appears to hold the per-plane data used for code points above 0xFFFF — the annex macros are not shown here.
CFCharSetAnnexStruct *_annex;
};
We'll need to know what the heck CFRuntimeBase
is, too:
// Common header embedded as the first member (_base) of struct __CFCharacterSet.
typedef struct __CFRuntimeBase {
uintptr_t _cfisa;
// Four info bytes; the surrounding text reads the character set's class type from the first one.
uint8_t _cfinfo[4];
// _rc is only part of the layout when __LP64__ is defined.
#if __LP64__
uint32_t _rc;
#endif
} CFRuntimeBase;
And guess what! There are also some constants that we'll need.
// Class-type constants for a character set. The surrounding text ANDs the first info
// byte with __kCFCharSetClassTypeMask to obtain one of the class values below.
enum {
__kCFCharSetClassTypeMask = 0x0070,
__kCFCharSetClassBuiltin = 0x0000,
__kCFCharSetClassRange = 0x0010,
__kCFCharSetClassString = 0x0020,
__kCFCharSetClassBitmap = 0x0030,
__kCFCharSetClassSet = 0x0040,
// NOTE(review): same value as __kCFCharSetClassSet above, exactly as quoted — confirm against the upstream source.
__kCFCharSetClassCompactBitmap = 0x0040,
// irrelevant stuff redacted
};
We can then break on CFCharacterSetIsLongCharacterMember
and log the structure:
supersetA.contains(UnicodeScalar(128518)!)
(lldb) po [NSData dataWithBytes:$rdi length:48]
<21b3d2ad ffff1d00 90190000 02000000 00000000 00000000 06f60100 00000000 01000000 00000000 00000000 00000000>
Based on the structs above, we can figure out what this character set is made of. In this case, the relevant part is going to be the first byte of cfinfo
from CFRuntimeBase
, which are bytes 9-12. The first byte of this, 0x90
contains the type information for the character set. It needs to be AND
ed with __kCFCharSetClassTypeMask
, which gets us 0x10
, which is __kCFCharSetClassRange
.
For this line:
supersetB.contains(UnicodeScalar(128518)!)
the structure is:
(lldb) po [NSData dataWithBytes:$rdi length:48]
<21b3d2ad ffff1d00 a0190000 02000000 00000000 00000000 9066f000 01000000 02000000 00000000 00000000 00000000>
This time byte 9 is 0xa0
, which AND
ed with the mask is 0x20
, __kCFCharSetClassString
.
At this point the Monty Python cast are screaming "Get On With It!", so let's go through the source for CFCharacterSetIsLongCharacterMember
and see what's going on.
Skipping past all the CF_OBJC_FUNCDISPATCHV
crap, we get to this line:
if (plane) {
This obviously evaluates to true in both cases. Next test:
if (__CFCSetIsBuiltin(theSet)) {
This evaluates to false in both cases, since neither type was __kCFCharSetClassBuiltin
, so we skip that block.
isAnnexInverted = __CFCSetAnnexIsInverted(theSet);
In both cases, the _annex
pointer was null (see all the zeros at the end of the structure), so this is false
.
This test will be true
for the same reason:
if ((annexPlane = __CFCSetGetAnnexPlaneCharacterSetNoAlloc(theSet, plane)) == NULL) {
taking us to:
if (!__CFCSetHasNonBMPPlane(theSet) && __CFCSetIsRange(theSet)) {
The __CFCSetHasNonBMPPlane
macro checks _annex
, so that's false. The emoji, of course, is not in the BMP plane, so this actually seems wrong for both cases, even the one that was returning the correct result.
__CFCSetIsRange
checks if our type is __kCFCharSetClassRange
, which is only true the first time. So this is our point of divergence. The second invocation of this, which produces the incorrect result, returns on the next line:
return (isAnnexInverted ? true : false);
And since the annex is NULL
, causing isAnnexInverted
to be false, this returns false.
As for how to fix it... well, I can't. But now we know why it happened. From what I can tell, the main problem is that the _annex
field isn't being filled when the character set is created, and since the annex seems to be used to keep track of characters in non-BMP planes, I think it ought to be present for both of the character sets. Incidentally, this information will probably be helpful in a bug report should you decide to file one (I'd file it against CoreFoundation, since that's where the actual issue is).
Strange Behavior In CharacterSet.contains() Method, With High UTF8 Characters Mixed With ASCII
It seems that CharacterSet.init(charactersIn string: String)
does not work correctly if the string contains characters from both inside and outside the BMP (basic multilingual plane):
// One scalar inside the BMP ("a") followed by one outside it (U+1F600),
// which is the combination that triggers the problem described above.
let s = "a\u{1F600}"
let cs = CharacterSet(charactersIn: s)
s.unicodeScalars.forEach {
    print(cs.contains($0))
}
// Expected output: true, true
// Actual output: true, false
A workaround is to create the character set from the sequence of Unicode scalars instead:
let cs = CharacterSet(s.unicodeScalars)
This will produce the expected output.
But note that this cannot handle the full range of Swift Character
s (which include grapheme clusters consisting of multiple Unicode scalars). Therefore you might want to work with a Set<Character>
instead.
NSArray from NSCharacterSet
The following code creates an array containing all characters of a given character set. It also works for characters outside of the "basic multilingual plane" (characters > U+FFFF, e.g. U+10400 DESERET CAPITAL LETTER LONG I).
// Collect every member of the character set as its own NSString.
NSCharacterSet *charset = [NSCharacterSet uppercaseLetterCharacterSet];
NSMutableArray *array = [NSMutableArray array];
// Visit planes 0 through 16, skipping any plane the set reports as empty.
for (int plane = 0; plane <= 16; plane++) {
    if (![charset hasMemberInPlane:plane]) {
        continue;
    }
    UTF32Char endOfPlane = (plane + 1) << 16;
    for (UTF32Char codePoint = plane << 16; codePoint < endOfPlane; codePoint++) {
        if (![charset longCharacterIsMember:codePoint]) {
            continue;
        }
        // Swap to little-endian so the bytes match the UTF-32LE encoding named below.
        UTF32Char littleEndian = OSSwapHostToLittleInt32(codePoint);
        NSString *member = [[NSString alloc] initWithBytes:&littleEndian
                                                    length:sizeof(littleEndian)
                                                  encoding:NSUTF32LittleEndianStringEncoding];
        [array addObject:member];
    }
}
For the uppercaseLetterCharacterSet
this gives an array of 1467 elements. But note that characters > U+FFFF are stored as a UTF-16 surrogate pair in NSString
, so for example U+10400 actually is stored in NSString
as 2 characters "\uD801\uDC00".
Swift 2 code can be found in other answers to this question.
Here is a Swift 3 version, written as an extension method:
extension CharacterSet {
    /// Collects the set's members into an array of `Character`, ordered by code point.
    ///
    /// Only planes (0 through 16) for which `hasMember(inPlane:)` is true are scanned.
    ///
    /// - Returns: One `Character` for each Unicode scalar the set contains.
    func allCharacters() -> [Character] {
        let occupiedPlanes = (0...16 as ClosedRange<UInt8>).filter { hasMember(inPlane: $0) }
        return occupiedPlanes.flatMap { plane -> [Character] in
            // A plane covers 0x10000 consecutive values starting at plane << 16.
            let start = UInt32(plane) << 16
            return (start ..< start + 0x1_0000).compactMap { value -> Character? in
                // Values the failable scalar initializer rejects are dropped here.
                guard let scalar = UnicodeScalar(value), contains(scalar) else { return nil }
                return Character(scalar)
            }
        }
    }
}
Example:
let charset = CharacterSet.uppercaseLetters
let chars = charset.allCharacters()
print(chars.count) // 1521
print(chars) // ["A", "B", "C", ...]
(Note that some characters may not be present in the font used to
display the result.)
Swift string indexing combines \r\n as one char instead of two
TLDR: \r\n
is a grapheme cluster and is treated as a single Character
in Swift because Unicode.
Swift treats
\r\n
as one Character
. Objective-C
NSString
treats it as two characters (in terms of the result from length
).
On the swift-users forum someone wrote:
– "\r\n" is a single
Character
. Is this the correct behaviour? – Yes, a
Character
corresponds to a Unicode grapheme cluster, and "\r\n" is considered a single grapheme cluster.
And the subsequent response posted a link to Unicode documentation, check out this table which officially states CRLF is a grapheme cluster.
Take a look at the Apple documentation on Characters and Grapheme Clusters.
It's common to think of a string as a sequence of characters, but when working with NSString objects, or with Unicode strings in general, in most cases it is better to deal with substrings rather than with individual characters. The reason for this is that what the user perceives as a character in text may in many cases be represented by multiple characters in the string.
The Swift documentation on Strings and Characters is also worth reading.
This overview from objc.io is interesting as well.
NSString
represents UTF-16-encoded text. Length, indices, and ranges are all based on UTF-16 code units.
Another example of this is an emoji like 👍🏻. This single character is actually %uD83D%uDC4D%uD83C%uDFFB, four UTF-16 code units that encode two Unicode scalars (U+1F44D and U+1F3FB). But if you called count
on a string with just that emoji you'd (correctly) get 1
.
If you wanted to see the scalars you could iterate them as follows:
// Print the numeric value of every Unicode scalar in `text`, space-separated on one line.
text.unicodeScalars.forEach { scalar in
    print("\(scalar.value) ", terminator: "")
}
Which for "\r\n"
would give you 13 10
In the Swift documentation you'll find why NSString
is different:
The count of the characters returned by the count property isn’t always the same as the length property of an NSString that contains the same characters. The length of an NSString is based on the number of 16-bit code units within the string’s UTF-16 representation and not the number of Unicode extended grapheme clusters within the string.
Thus this isn't really "strange" behaviour of Swift string indexing, but rather a result of how Unicode treats these characters and how String
in Swift is designed. Swift string indexing goes by Character
and \r\n
is a single Character
.
Remove all non-numeric characters from a string in swift
I was hoping there would be something like stringFromCharactersInSet() which would allow me to specify only valid characters to keep.
You can either use trimmingCharacters
with the inverted
character set to remove characters from the start or the end of the string. In Swift 3 and later:
let result = string.trimmingCharacters(in: CharacterSet(charactersIn: "0123456789.").inverted)
Or, if you want to remove non-numeric characters anywhere in the string (not just the start or end), you can filter
the characters
, e.g. in Swift 4.2.1:
let result = string.filter("0123456789.".contains)
Or, if you want to remove characters from a CharacterSet from anywhere in the string, use:
let result = String(string.unicodeScalars.filter(CharacterSet.whitespaces.inverted.contains))
Or, if you want to only match valid strings of a certain format (e.g. ####.##
), you could use regular expression. For example:
if let range = string.range(of: #"\d+(\.\d*)?"#, options: .regularExpression) {
let result = string[range] // or `String(string[range])` if you need `String`
}
The behavior of these different approaches differs slightly so it just depends on precisely what you're trying to do. Include or exclude the decimal point if you want decimal numbers, or just integers. There are lots of ways to accomplish this.
For older, Swift 2 syntax, see previous revision of this answer.
Why are emoji characters like 👩👩👧👦 treated so strangely in Swift strings?
This has to do with how the String
type works in Swift, and how the contains(_:)
method works.
The '👩‍👩‍👧‍👦' is what's known as an emoji sequence, which is rendered as one visible character in a string. The sequence is made up of Character
objects, and at the same time it is made up of UnicodeScalar
objects.
If you check the character count of the string, you'll see that it is made up of four characters, while if you check the unicode scalar count, it will show you a different result:
// The family emoji 👩‍👩‍👧‍👦, spelled out scalar by scalar so the zero-width joiners (U+200D) are visible.
print("\u{1f469}\u{200d}\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}".characters.count) // 4
print("\u{1f469}\u{200d}\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}".unicodeScalars.count) // 7
Now, if you parse through the characters and print them, you'll see what seems like normal characters, but in fact the three first characters contain both an emoji as well as a zero-width joiner in their UnicodeScalarView
:
// Walk the family emoji 👩‍👩‍👧‍👦 character by character and show each character's scalars in hex.
// The literal is spelled out as scalars so the zero-width joiners (U+200D) are visible.
for char in "\u{1f469}\u{200d}\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}".characters {
    print(char)
    let scalars = String(char).unicodeScalars.map({ String($0.value, radix: 16) })
    print(scalars)
}
// 👩
// ["1f469", "200d"]
// 👩
// ["1f469", "200d"]
// 👧
// ["1f467", "200d"]
// 👦
// ["1f466"]
As you can see, only the last character does not contain a zero-width joiner, so when using the contains(_:)
method, it works as you'd expect. Since you aren't comparing against emoji containing zero-width joiners, the method won't find a match for any but the last character.
To expand on this, if you create a String
which is composed of an emoji character ending with a zero-width joiner, and pass it to the contains(_:)
method, it will also evaluate to false
. This has to do with contains(_:)
being the exact same as range(of:) != nil
, which tries to find an exact match to the given argument. Since characters ending with a zero-width joiner form an incomplete sequence, the method tries to find a match for the argument while combining characters ending with zero-width joiners into a complete sequence. This means that the method won't ever find a match if:
- the argument ends with a zero-width joiner, and
- the string to parse doesn't contain an incomplete sequence (i.e. ending with a zero-width joiner and not followed by a compatible character).
To demonstrate:
let s = "\u{1f469}\u{200d}\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}" // 👩‍👩‍👧‍👦
s.range(of: "\u{1f469}\u{200d}") != nil // false
s.range(of: "\u{1f469}\u{200d}\u{1f469}") != nil // false
However, since the comparison only looks ahead, you can find several other complete sequences within the string by working backwards:
s.range(of: "\u{1f466}") != nil // true
s.range(of: "\u{1f467}\u{200d}\u{1f466}") != nil // true
s.range(of: "\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}") != nil // true
// Same as the above:
s.contains("\u{1f469}\u{200d}\u{1f467}\u{200d}\u{1f466}") // true
The easiest solution would be to provide a specific compare option to the range(of:options:range:locale:)
method. The option String.CompareOptions.literal
performs the comparison on an exact character-by-character equivalence. As a side note, what's meant by character here is not the Swift Character
, but the UTF-16 representation of both the instance and comparison string – however, since String
doesn't allow malformed UTF-16, this is essentially equivalent to comparing the Unicode scalar representation.
Here I've overloaded the Foundation
method, so if you need the original one, rename this one or something:
extension String {
    /// Reports whether `string` occurs in the receiver, searching with the
    /// `.literal` compare option.
    ///
    /// - Parameter string: The text to look for.
    /// - Returns: `true` when `range(of:options:)` finds a match, `false` otherwise.
    func contains(_ string: String) -> Bool {
        let match = range(of: string, options: [.literal])
        return match != nil
    }
}
Now the method works as it "should" with each character, even with incomplete sequences:
s.contains("\u{1f469}") // true – a lone 👩
s.contains("\u{1f469}\u{200d}") // true – 👩 followed by a zero-width joiner
s.contains("\u{200d}") // true – just the zero-width joiner
Related Topics
"Ambiguous Reference to Member Map" When Attempting to Append/Replace Array Element
Get Png Representation of Nsimage in Swift
Nstoolbarflexiblespaceitem Is Constraint to Nssplitviewitem in Swift
How to Draw a Line Between Two Points Over an Image in Swift 3
Type 'Bundle' Has No Member "Module"
Swiftycam Capture Session Is Not Running
Apple Turicreate Always Return The Same Label
How to Correctly Use Shouldcompactonlaunch in Realmswift
Swift 3 Google Map Add Markers on Touch
How to Observe Object's Property in Rxswift
Can't Get Throws to Work with Function with Completion Handler
Switch to Match Multiple Cases from Optionsettype
Uibutton Action Is Not Triggered After Constraint Layouts Changed
Preferredstatusbarupdateanimation Being Ignored
Swift Increment Int! Not Working