|
Revision 5609, 0.6 kB
(checked in by mtredinnick, 2 years ago)
|
Merged Unicode branch into trunk (r4952:5608). This should be fully
backwards compatible for all practical purposes.
Fixed #2391, #2489, #2996, #3322, #3344, #3370, #3406, #3432, #3454, #3492, #3582, #3690, #3878, #3891, #3937, #4039, #4141, #4227, #4286, #4291, #4300, #4452, #4702
|
- Property svn:eol-style set to
native
- Property svn:keywords set to
LastChangedRevision
|
| Line | |
|---|
# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.
|---|
| 5 |
|
|---|
| 6 |
# Lowercase English stopwords, written one per line and split on
# whitespace into a plain list of words.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()
|---|
| 33 |
|
|---|
| 34 |
def strip_stopwords(sentence):
    """Remove stopwords from ``sentence`` and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lowercased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words are joined with single spaces.
    Words that are kept retain their original case.
    """
    kept = [
        word
        for word in sentence.split()
        # Compare case-insensitively; the stopword list is all lowercase.
        if word.lower() not in stopwords
    ]
    return u' '.join(kept)
|---|