forked from norvig/pytudes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpal2.py
262 lines (227 loc) · 10.1 KB
/
pal2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
import random, re, bisect, time
"""Produce Panama-ish Palindromes. Copyright (C) 2002-2008, Peter Norvig."""
################ Checking for Palindromes
def is_panama(s):
"Test if string s is a Panama-ish palindrome."
return is_palindrome(s) and is_unique(phrases(s))
def is_palindrome(s):
"Test if a string is a palindrome."
s1 = canonical(s)
return s1 == reversestr(s1)
def phrases(s):
"Break a string s into comma-separated phrases."
return [phrase.strip() for phrase in s.split(',')]
def canonical(word, sub=re.compile('''[-* \t\n\r.,;!?:()`"']''').sub):
"The canonical form for comparing: lowercase, no blanks or punctuation."
return sub('', word).lower()
################ Utilities
def reversestr(x):
"Reverse a string."
return x[::-1]
def is_unique(seq):
"Return true if seq has no duplicate elements."
return len(seq) == len(set(seq))
def update(obj, **entries):
"Change attributes of obj, according to the keyword args."
obj.__dict__.update(entries)
return obj
################ Reading in a dictionary
class PalDict:
"""A dictionary from which you can find canonical words that start or end
with a given canonical substring, and find the true name of a
canonical word with d.truename[canonicalword]."""
def __init__(self, k=1000, filename='npdict.txt'):
words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}
for tword in open(filename).read().splitlines():
word = canonical(tword)
words.append(word)
rwords.append(reversestr(word))
truename[word] = tword
words.sort()
rwords.sort()
update(self, k=k, words=words, rwords=rwords, truename=truename,
reversibles={}, rangek=range(k), tryharder=False)
def startswith(self, prefix):
"""Return up to k canonical words that start with prefix.
If there are more than k, choose from them at random."""
return self._k_startingwith(self.words, prefix)
def endswith(self, rsuffix):
"""Return up to k canonical words that end with the reversed suffix.
If you want words ending in 'ing', ask for d.endswith('gni').
If there are more than k, choose from them at random."""
return map(reversestr, self._k_startingwith(self.rwords, rsuffix))
def __contains__(self, word):
return word in self.truename
def reversible_words(self):
"Find words that have a reverse in the dict, like {'Camus': 'Sumac'}"
if not self.reversibles:
reversibles = self.reversibles
for rw in self.rwords:
if rw in self:
w = reversestr(rw)
if w != rw and w not in reversibles:
reversibles[w] = rw
self.reversibles = reversibles
return self.reversibles
def _k_startingwith(self, words, prefix):
start = bisect.bisect_left(words, prefix)
end = bisect.bisect(words, prefix + 'zzzz')
n = end - start
if self.k >= n: # get all the words that start with prefix
results = words[start:end]
else: # sample from words starting with prefix
indexes = random.sample(xrange(start, end), self.k)
results = [words[i] for i in indexes]
random.shuffle(results)
## Consider words that are prefixes of the prefix.
## This is very slow, so don't use it until late in the game.
if self.tryharder:
for i in range(3, len(prefix)):
w = prefix[0:i]
if ((words == self.words and w in self.truename) or
(words == self.rwords and reversestr(w) in self.truename)):
results.append(w)
return results
paldict = PalDict()
def anpdictshort():
"Find the words that are valid when every phrase must start with 'a'"
def segment(word): return [s for s in word.split('a') if s]
def valid(word): return all(reversestr(s) in segments for s in segment(word))
words = map(canonical, file('anpdict.txt'))
segments = set(s for w in words for s in segment(canonical(w)))
valid_words = [paldict.truename[w] for w in words if valid(w)]
file('anpdict-short.txt', 'w').write('\n'.join(valid_words))
################ Search for a palindrome
class Panama:
def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):
## .left and .right hold lists of canonical words
## .diff holds the number of characters that are not matched,
## positive for words on left, negative for right.
## .stack holds (action, side, arg) tuples
update(self, left=[], right=[], best=0, seen={}, diff=0, stack=[],
used_reversibles=False, starttime=time.clock(), dict=dict)
for word in L.split(','):
self.add('left', canonical(word))
for rword in reversestr(R).split(','):
self.add('right', canonical(reversestr(rword)))
self.consider_candidates()
def search(self, steps=50000000):
"Search for palindromes."
for _ in xrange(steps):
if not self.stack:
return 'done'
action, dir, substr, arg = self.stack[-1]
if action == 'added': # undo the last word added
self.remove(dir, arg)
elif action == 'trying' and arg: # try the next word if there is one
self.add(dir, arg.pop()) and self.consider_candidates()
elif action == 'trying' and not arg: # otherwise backtrack
self.stack.pop()
else:
raise ValueError(action)
def add(self, dir, word):
"add a word"
if word in self.seen:
return False
else:
getattr(self, dir).append(word)
self.diff += factor[dir] * len(word)
self.seen[word] = True
self.stack.append(('added', dir, '?', word))
return True
def remove(self, dir, word):
"remove a word"
oldword = getattr(self, dir).pop()
assert word == oldword
self.diff -= factor[dir] * len(word)
del self.seen[word]
self.stack.pop()
def consider_candidates(self):
"""Push a new state with a set of candidate words onto stack."""
if self.diff > 0: # Left is longer, consider adding on right
dir = 'right'
substr = self.left[-1][-self.diff:]
candidates = self.dict.endswith(substr)
elif self.diff < 0: # Right is longer, consider adding on left
dir = 'left'
substr = reversestr(self.right[-1][0:-self.diff])
candidates = self.dict.startswith(substr)
else: # Both sides are same size
dir = 'left'
if not self.used_reversibles:
self.report()
self.add_reversibles()
substr = ''
candidates = self.dict.startswith('')
if substr == reversestr(substr):
self.report()
self.stack.append(('trying', dir, substr, candidates))
def add_reversibles(self):
"Add in reversible words."
print 'using reversibles ...'
for (word, rword) in self.dict.reversible_words().items():
if word not in self.seen and rword not in self.seen:
self.add('left', word)
self.add('right', rword)
self.used_reversibles = True
self.stack = []
print '...done'
def report(self):
"Report a new palindrome to log file (if it is sufficiently big)."
N = len(self)
if N > 13333:
self.dict.tryharder = True
if N > self.best and (N > 12500 or N > self.best+500):
self.best = len(self)
self.bestphrase = str(self)
print '%5d phrases (%5d words) in %3d seconds' % (
self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime)
assert is_panama(self.bestphrase)
f = open('pallog%d.txt' % (id(self) % 10000), 'w')
f.write(self.bestphrase + '\n')
f.close()
def __len__(self):
return len(self.left) + len(self.right)
def __str__(self):
truename = self.dict.truename
lefts = [truename[w] for w in self.left]
rights =[truename[w] for w in self.right]
return ', '.join(lefts + rights[::-1])
factor = {'left': +1, 'right': -1}
# Note that we only allow one truename per canonical name. Occasionally
# this means we miss a good word (as in "a node" vs. "an ode"), but there
# are only 665 of these truename collisions, and most of them are of the
# form "a mark-up" vs. "a markup" so it seemed better to disallow them.
################ Unit Tests
def tests(p=Panama()):
assert is_panama('A man, a plan, a canal, Panama.')
assert is_panama('''A (man), a plan,,;, a ```canal?'' -- Panama!''')
assert not is_panama('A man, a plan, a radar, a canal, Panama.')
assert is_palindrome('A man, a plan, a canal, Panama.')
assert is_palindrome('radar, radar? radar!')
assert not is_palindrome('radars')
assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']
assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'
assert reversestr('foo') == 'oof'
assert is_unique([1, 2, 3])
assert not is_unique([1, 2, 2])
d = p.dict
def sameset(a, b): return set(a) == set(b)
assert 'panama' in d
assert d.words[0] in d
assert d.words[-1] in d
assert sameset(d.startswith('aword'), ['awording', 'awordbreak',
'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',
'awordlessness', 'aword', 'awordsmith'])
assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',
'asubglobosity', 'anonverbosity', 'agibbosity'])
d.tryharder = True
assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])
d.tryharder = False
assert d.startswith('oklahoma') == ['oklahoma']
assert d.startswith('fsfdsfdsfds') == []
print 'all tests pass'
if __name__ == '__main__':
p = Panama();
tests(p)
p.search()