Fix tokenizer behavior on multiple contiguous split characters (#271)

Multiple contiguous spaces or punctuation characters should be treated as a single separator when splitting, so that a run of them does not yield empty tokens.
lucaong · Jul 17, 2024 · a8e6765 · a8e6765
1 parent 7178fa7
commit a8e6765
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 1 deletion.
diff --git a/src/MiniSearch.test.js b/src/MiniSearch.test.js
@@ -1712,6 +1712,12 @@ e forse del mio dir poco ti cale`
       expect(ms.search('الرأس').map(({ id }) => id)).toEqual([4])
       expect(ms.search('123').map(({ id }) => id)).toEqual([5])
     })
+
+    it('splits on multiple contiguous spaces or punctuation characters correctly', () => {
+      const tokenize = MiniSearch.getDefault('tokenize')
+
+      expect(tokenize('a  b...c ? d')).toEqual(['a', 'b', 'c', 'd'])
+    })
   })
 
   describe('autoSuggest', () => {

diff --git a/src/MiniSearch.ts b/src/MiniSearch.ts
@@ -2205,4 +2205,4 @@ const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))
 
 // This regular expression matches one or more contiguous Unicode space,
 // newline, or punctuation characters
-const SPACE_OR_PUNCTUATION = /[\n\r\p{Z}\p{P}]/u
+const SPACE_OR_PUNCTUATION = /[\n\r\p{Z}\p{P}]+/u