Skip to content

Commit

Permalink
Fix tokenizer behavior on multiple contiguous split characters (#271)
Browse files Browse the repository at this point in the history
Multiple contiguous spaces or punctuation characters should be treated as a
single separator when splitting
  • Loading branch information
lucaong authored Jul 17, 2024
1 parent 7178fa7 commit a8e6765
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
6 changes: 6 additions & 0 deletions src/MiniSearch.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1712,6 +1712,12 @@ e forse del mio dir poco ti cale`
expect(ms.search('الرأس').map(({ id }) => id)).toEqual([4])
expect(ms.search('123').map(({ id }) => id)).toEqual([5])
})

// Regression test for #271: a run of adjacent separators (spaces, dots, a
// question mark surrounded by spaces) must act as a single split point, so the
// default tokenizer yields only the four non-empty terms.
it('splits on multiple contiguous spaces or punctuation characters correctly', () => {
// Fetch the library's default tokenizer rather than a custom one
const tokenize = MiniSearch.getDefault('tokenize')

// Input mixes a single space, three dots, and " ? " between the terms
expect(tokenize('a b...c ? d')).toEqual(['a', 'b', 'c', 'd'])
})
})

describe('autoSuggest', () => {
Expand Down
2 changes: 1 addition & 1 deletion src/MiniSearch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2205,4 +2205,4 @@ const wait = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms))

// This regular expression matches any Unicode space, newline, or punctuation
// character
//
// NOTE(review): the two declarations below are the removed and the added line
// of this diff, shown without their -/+ markers. The first matches exactly one
// separator character; the second adds the `+` quantifier so that a run of one
// or more contiguous separator characters is matched as a single unit.
const SPACE_OR_PUNCTUATION = /[\n\r\p{Z}\p{P}]/u
const SPACE_OR_PUNCTUATION = /[\n\r\p{Z}\p{P}]+/u

0 comments on commit a8e6765

Please sign in to comment.