Escolar Documentos
Profissional Documentos
Cultura Documentos
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"brazilian_keywords": {
"type": "keyword_marker",
"keywords": []
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"brazilian_stop",
"brazilian_keywords"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
https://gist.github.com/alopes/5358189#file-stopwords-txt
-- INI_STOPWORDS --
br_stopwords
------------
https://github.com/apache/lucene-solr/blob/master/lucene/analysis/common/src/resources/org/apache/lucene/analysis/br/stopwords.txt
a
ainda
alem
ambas
ambos
antes
ao
aonde
aos
apos
aquele
aqueles
as
assim
com
como
contra
contudo
cuja
cujas
cujo
cujos
da
das
de
dela
dele
deles
demais
depois
desde
desta
deste
dispoe
dispoem
diversa
diversas
diversos
do
dos
durante
e
ela
elas
ele
eles
em
entao
entre
essa
essas
esse
esses
esta
estas
este
estes
ha
isso
isto
logo
mais
mas
mediante
menos
mesma
mesmas
mesmo
mesmos
na
nas
nao
nas
nem
nesse
neste
nos
o
os
ou
outra
outras
outro
outros
pelas
pelas
pelo
pelos
perante
pois
por
porque
portanto
proprio
propios
quais
qual
qualquer
quando
quanto
que
quem
quer
se
seja
sem
sendo
seu
seus
sob
sobre
sua
suas
tal
tambem
teu
teus
toda
todas
todo
todos
tua
tuas
tudo
um
uma
umas
uns
-- END_STOPWORDS --
ALTERNATIVA:
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
ALTERNATIVA:
{
"settings": {
"analysis": {
"filter" : {
"DEJT_remove_colons" : {
"type": "pattern_replace",
"pattern": ",",
"replacement": ""
}
},
"analyzer": {
"DEJT_analyzer": {
"type" : "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"DEJT_remove_colons"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "DEJT_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
Alternativa:
{
"settings": {
"analysis": {
"filter" : {
"DEJT_remove_colons" :
{
"type": "pattern_replace",
"pattern": "[,.]",
"replacement": ""
}
},
"analyzer": {
"DEJT_analyzer": {
"type" : "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"DEJT_remove_colons"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "DEJT_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
Alternativa:
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"brazilian_keywords": {
"type": "keyword_marker",
"keywords": [","]
}
},
"char_filter" : {
"DEJT_char_filter" : {
"type" : "mapping",
"mappings" : [
", => ",
". => "
]
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"brazilian_stop",
"brazilian_keywords"
],
"char_filter" : [
"DEJT_char_filter"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
Alternativa:
{
"settings": {
"analysis": {
"filter" : {
"my_word_delimiter" : {
"type" : "word_delimiter",
"generate_word_parts" : false,
"generate_number_parts" : false,
"split_on_case_change" : false,
"split_on_numerics" : false
}
},
"analyzer": {
"my_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding",
"my_word_delimiter"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
Alternativa:
{
"settings": {
"analysis": {
"filter" : {
"my_stop" : {
"type" : "stop",
"stopwords" : [ "," , "." ]
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"my_stop"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
Alternativa:
{
"settings": {
"analysis": {
"filter" : {
"my_stop" : {
"type" : "stop",
"stopwords" : [ "," , "." ]
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
". => ",
", => "
]
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "whitespace",
"filter": [
"lowercase",
"asciifolding",
"my_stop"
],
"char_filter" : [
"my_char_filter"
]
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
Alternativa:
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[,. ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
TOKENIZER FC-05:
WordStop
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
-------
28/05
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"custom_stop" : {
"type" : "stop",
"stopwords" : [ "para" ]
},
"brazilian_keywords": {
"type": "keyword_marker",
"keywords": []
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"stopwords" : [ "para" ],
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
-----
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"stopwords" : [ "para" ],
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
---
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"brazilian_keywords": {
"type": "keyword_marker",
"keywords": []
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"stopwords" : [ "para" ],
"filter": [
"lowercase",
"asciifolding",
"brazilian_stop"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
--
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
--
Ultimo:
{
"settings": {
"analysis": {
"filter": {
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"custom_stop" : {
"type" : "stop",
"stopwords" : [ "para" ]
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding",
"brazilian_stop",
"custom_stop"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
--
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
---
Ultimo:
{
"settings": {
"analysis": {
"filter" : {
"my_shingle_filter" : {
"type": "shingle",
"min_shingle_size": 2,
"max_shingle_size": 2,
"output_unigrams": false
},
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
},
"custom_stop" : {
"type" : "stop",
"stopwords" : [ "para" ]
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding",
"my_shingle_filter",
"brazilian_stop",
"custom_stop"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
--
ultimo:
{
"settings": {
"analysis": {
"filter" : {
"my_shingle_filter" : {
"type": "shingle",
"min_shingle_size": 3,
"max_shingle_size": 5,
"output_unigrams": true
},
"brazilian_stop": {
"type": "stop",
"stopwords": "_brazilian_"
}
},
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding",
"my_shingle_filter",
"brazilian_stop"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "[)(,.;\n ]"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
--
Usar esse!!!!
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "(?<=[\\p{L}\\d])([:;,.?!-])(?=[\\s\\n])|(?<=[\\p{L}])(\\/)(?=[\\p{L}])|(?<=[\\p{L}\\d])(\\,)(?=[\\p{L}\\s])|(?<=[\\p{L}\\d)(])(\\.)([\\s\\n]|$)|(?<=\\s)(-)(?=\\s)|([\\s\\(\\)\\[\\]\\{\\}])|(?<=[\\d)(])(\\/)(?=[\\d])|([\"])|([\\.\\_\\-]{2,})|([a-z])(?=[\\d]{5,6})"
}
}
}
},
"mappings": {
"properties": {
"content": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"file" : {
"properties" : {
"filename" : {
"type" : "keyword",
"store" : true
}
}
}
}
}
}
(?<=[\\p{L}])(\\/)(?=[\\p{L}])|(?<=[\\p{L}\\d])(\\,)(?=[\\p{L}\\s])|(?<=[\\p{L}\\d])(\\:)(?=\\s)|(?<=[\\p{L}\\d])(\\.)([\\s\\n]|$)|(?<=\\s)(-)(?=\\s)|([\\s\\(\\)])
-> 4X. Captura o ponto final depois de palavras e números, ex: Valor de R$
1.234,30.
(?<=[\p{L}\d)(])(\.)([\s\n]|$)
TODO: Não captura a minha matrícula quando ela termina em ponto e vírgula (;).
(?<=[\p{L}\d])([:;,.?!-])(?=[\s\n])|(?<=[\p{L}])(\/)(?=[\p{L}])|(?<=[\p{L}\d])(\,)(?=[\p{L}\s])|(?<=[\p{L}\d)(])(\.)([\s\n]|$)|(?<=\s)(-)(?=\s)|([\s\(\)\[\]\{\}])|(?<=[\d)(])(\/)(?=[\d])|(["])|([\.\_\-]{2,})|([a-z])(?=[\d]{5,6})
REGEX Otimizadas:
Regra 2: Captura a barra (/) quando estiver separando palavras (Brasília/DF, por exemplo)
REGEX: (?<=[\p{L}])(\/)(?=[\p{L}])
REGEX_JAVA: (?<=[\\p{L}])(\\/)(?=[\\p{L}])
Regra 7: Quebrar números que estiverem separados por barra (/): Lei nº 8.112/1992,
Lei 8.112/92, Lei 8.112..
REGEX: (?<=[\d)(])(\/)(?=[\d])
REGEX_JAVA: (?<=[\\d)(])(\\/)(?=[\\d])
(?<=[\\p{L}\\d])([:;,.?!-])(?=[\\s\\n])|(?<=[\\p{L}])(\\/)(?=[\\p{L}])|(?<=[\\p{L}\\d])(\\,)(?=[\\p{L}\\s])|(?<=[\\p{L}\\d)(])(\\.)([\\s\\n]|$)|(?<=\\s)(-)(?=\\s)|([\\s\\(\\)\\[\\]\\{\\}])|(?<=[\\d)(])(\\/)(?=[\\d])|([\"])|([\\.\\_\\-]{2,})|([a-z])(?=[\\d]{5,6})
Mapping PJE_Sentencas:
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
],
"char_filter" : [
"html_strip"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "(?<=[\\p{L}\\d])([:;,.?!-])(?=[\\s\\n])|(?<=[\\p{L}])(\\/)(?=[\\p{L}])|(?<=[\\p{L}\\d])(\\,)(?=[\\p{L}\\s])|(?<=[\\p{L}\\d)(])(\\.)([\\s\\n]|$)|(?<=\\s)(-)(?=\\s)|([\\s\\(\\)\\[\\]\\{\\}])|(?<=[\\d)(])(\\/)(?=[\\d])|([\"])|([\\.\\_\\-]{2,})|([a-z])(?=[\\d]{5,6})"
}
}
}
},
"mappings": {
"properties": {
"documento": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"documento_parsed": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
}
}
}
}
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type" : "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase",
"asciifolding"
],
"char_filter" : [
"html_strip"
]
}
},
"tokenizer" : {
"my_tokenizer" : {
"type": "pattern",
"pattern" : "(?<=[\\p{L}\\d])([:;,.?!-])(?=[\\s\\n])|(?<=[\\p{L}])(\\/)(?=[\\p{L}])|(?<=[\\p{L}\\d])(\\,)(?=[\\p{L}\\s])|(?<=[\\p{L}\\d)(])(\\.)([\\s\\n]|$)|(?<=\\s)(-)(?=\\s)|([\\s\\(\\)\\[\\]\\{\\}])|(?<=[\\d)(])(\\/)(?=[\\d])|([\"])|([\\.\\_\\-]{2,})|([a-z])(?=[\\d]{5,6})"
}
}
}
},
"mappings": {
"properties": {
"documento": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
},
"documento_parsed": {
"type": "text",
"analyzer": "my_analyzer",
"term_vector" : "with_positions_offsets"
}
}
}
}
New REGEX:
1. Simulação de (\W+):
([^A-Za-z0-9\x{00C0}-\x{00FF}\x{00BA}\x{00AA}\/:.-]+)
2. O caractere anterior não é letra nem dígito; captura o hífen ou o ponto quando o próximo caractere é letra ou dígito:
(?<=[^\p{L}\d])([-.])(?=[\p{L}\d])