#!/usr/bin/perl -s
use Encode;
use Lingua::PT::PLNbase;
use utf8::all;
# %ent maps HTML5 named character references (the bare name, without the
# leading '&' and trailing ';') to their replacement text.
# The table is written as one qw() word list, so it is read as alternating
# "name replacement" words: every entry must stay a two-word pair or all
# following keys and values shift by one.
# Note that "nbsp" maps to the placeholder word _SPACE_ rather than to a
# literal character.
my %ent = ### html5 entities
qw(
aacute á
Aacute Á
abreve ă
Abreve Ă
ac ∾
acd ∿
acirc â
Acirc Â
acute ´
acy а
Acy А
aelig æ
AElig Æ
afr 𝔞
Afr 𝔄
agrave à
Agrave À
alefsym ℵ
aleph ℵ
alpha α
Alpha Α
amacr ā
Amacr Ā
amalg ⨿
amp &
and ∧
And ⩓
andand ⩕
andd ⩜
andslope ⩘
andv ⩚
ang ∠
ange ⦤
angle ∠
angmsd ∡
angmsdaa ⦨
angmsdab ⦩
angmsdac ⦪
angmsdad ⦫
angmsdae ⦬
angmsdaf ⦭
angmsdag ⦮
angmsdah ⦯
angrt ∟
angrtvb ⊾
angrtvbd ⦝
angsph ∢
angst Å
angzarr ⍼
aogon ą
Aogon Ą
aopf 𝕒
Aopf 𝔸
ap ≈
apacir ⩯
ape ≊
apE ⩰
apid ≋
apos '
approx ≈
approxeq ≊
aring å
Aring Å
ascr 𝒶
Ascr 𝒜
Assign ≔
ast *
asymp ≈
asympeq ≍
atilde ã
Atilde Ã
auml ä
Auml Ä
awconint ∳
awint ⨑
backcong ≌
backepsilon ϶
backprime ‵
backsim ∽
backsimeq ⋍
Backslash ∖
Barv ⫧
barvee ⊽
barwed ⌅
Barwed ⌆
barwedge ⌅
bbrk ⎵
bbrktbrk ⎶
bcong ≌
bcy б
Bcy Б
bdquo „
becaus ∵
because ∵
Because ∵
bemptyv ⦰
bepsi ϶
bernou ℬ
Bernoullis ℬ
beta β
Beta Β
beth ℶ
between ≬
bfr 𝔟
Bfr 𝔅
bigcap ⋂
bigcirc ◯
bigcup ⋃
bigodot ⨀
bigoplus ⨁
bigotimes ⨂
bigsqcup ⨆
bigstar ★
bigtriangledown ▽
bigtriangleup △
biguplus ⨄
bigvee ⋁
bigwedge ⋀
bkarow ⤍
blacklozenge ⧫
blacksquare ▪
blacktriangle ▴
blacktriangledown ▾
blacktriangleleft ◂
blacktriangleright ▸
blank ␣
blk12 ▒
blk14 ░
blk34 ▓
block █
bnot ⌐
bNot ⫭
bopf 𝕓
Bopf 𝔹
bot ⊥
bottom ⊥
bowtie ⋈
boxbox ⧉
boxdl ┐
boxdL ╕
boxDl ╖
boxDL ╗
boxdr ┌
boxdR ╒
boxDr ╓
boxDR ╔
boxh ─
boxH ═
boxhd ┬
boxhD ╥
boxHd ╤
boxHD ╦
boxhu ┴
boxhU ╨
boxHu ╧
boxHU ╩
boxminus ⊟
boxplus ⊞
boxtimes ⊠
boxul ┘
boxuL ╛
boxUl ╜
boxUL ╝
boxur └
boxuR ╘
boxUr ╙
boxUR ╚
boxv │
boxV ║
boxvh ┼
boxvH ╪
boxVh ╫
boxVH ╬
boxvl ┤
boxvL ╡
boxVl ╢
boxVL ╣
boxvr ├
boxvR ╞
boxVr ╟
boxVR ╠
bprime ‵
breve ˘
Breve ˘
brvbar ¦
bscr 𝒷
Bscr ℬ
bsemi ⁏
bsim ∽
bsime ⋍
bsol \
bsolb ⧅
bull •
bullet •
bump ≎
bumpe ≏
bumpE ⪮
bumpeq ≏
Bumpeq ≎
cacute ć
Cacute Ć
cap ∩
Cap ⋒
capand ⩄
capbrcup ⩉
capcap ⩋
capcup ⩇
capdot ⩀
CapitalDifferentialD ⅅ
caret ⁁
caron ˇ
Cayleys ℭ
ccaps ⩍
ccaron č
Ccaron Č
ccedil ç
Ccedil Ç
ccirc ĉ
Ccirc Ĉ
Cconint ∰
ccups ⩌
ccupssm ⩐
cdot ċ
Cdot Ċ
cedil ¸
Cedilla ¸
cemptyv ⦲
cent ¢
centerdot ·
CenterDot ·
cfr 𝔠
Cfr ℭ
chcy ч
CHcy Ч
check ✓
checkmark ✓
chi χ
Chi Χ
cir ○
circ ˆ
circeq ≗
circlearrowleft ↺
circlearrowright ↻
circledast ⊛
circledcirc ⊚
circleddash ⊝
CircleDot ⊙
circledR ®
circledS Ⓢ
CircleMinus ⊖
CirclePlus ⊕
CircleTimes ⊗
cire ≗
cirE ⧃
cirfnint ⨐
cirmid ⫯
cirscir ⧂
ClockwiseContourIntegral ∲
CloseCurlyDoubleQuote ”
CloseCurlyQuote ’
clubs ♣
clubsuit ♣
colon :
Colon ∷
colone ≔
Colone ⩴
coloneq ≔
comma ,
commat @
comp ∁
compfn ∘
complement ∁
complexes ℂ
cong ≅
congdot ⩭
Congruent ≡
conint ∮
Conint ∯
ContourIntegral ∮
copf 𝕔
Copf ℂ
coprod ∐
Coproduct ∐
copy ©
COPY ©
copysr ℗
CounterClockwiseContourIntegral ∳
crarr ↵
cross ✗
Cross ⨯
cscr 𝒸
Cscr 𝒞
csub ⫏
csube ⫑
csup ⫐
csupe ⫒
ctdot ⋯
cudarrl ⤸
cudarrr ⤵
cuepr ⋞
cuesc ⋟
cularr ↶
cularrp ⤽
cup ∪
Cup ⋓
cupbrcap ⩈
cupcap ⩆
CupCap ≍
cupcup ⩊
cupdot ⊍
cupor ⩅
curarr ↷
curarrm ⤼
curlyeqprec ⋞
curlyeqsucc ⋟
curlyvee ⋎
curlywedge ⋏
curren ¤
curvearrowleft ↶
curvearrowright ↷
cuvee ⋎
cuwed ⋏
cwconint ∲
cwint ∱
cylcty ⌭
dagger †
Dagger ‡
daleth ℸ
darr ↓
dArr ⇓
Darr ↡
dash ‐
dashv ⊣
Dashv ⫤
dbkarow ⤏
dblac ˝
dcaron ď
Dcaron Ď
dcy д
Dcy Д
dd ⅆ
DD ⅅ
ddagger ‡
ddarr ⇊
DDotrahd ⤑
ddotseq ⩷
deg °
Del ∇
delta δ
Delta Δ
demptyv ⦱
dfisht ⥿
dfr 𝔡
Dfr 𝔇
dHar ⥥
dharl ⇃
dharr ⇂
DiacriticalAcute ´
DiacriticalDot ˙
DiacriticalDoubleAcute ˝
DiacriticalGrave `
DiacriticalTilde ˜
diam ⋄
diamond ⋄
Diamond ⋄
diamondsuit ♦
diams ♦
die ¨
DifferentialD ⅆ
digamma ϝ
disin ⋲
div ÷
divide ÷
divideontimes ⋇
divonx ⋇
djcy ђ
DJcy Ђ
dlcorn ⌞
dlcrop ⌍
dollar $
dopf 𝕕
Dopf 𝔻
dot ˙
Dot ¨
doteq ≐
doteqdot ≑
DotEqual ≐
dotminus ∸
dotplus ∔
dotsquare ⊡
doublebarwedge ⌆
DoubleContourIntegral ∯
DoubleDot ¨
DoubleDownArrow ⇓
DoubleLeftArrow ⇐
DoubleLeftRightArrow ⇔
DoubleLeftTee ⫤
DoubleLongLeftArrow ⟸
DoubleLongLeftRightArrow ⟺
DoubleLongRightArrow ⟹
DoubleRightArrow ⇒
DoubleRightTee ⊨
DoubleUpArrow ⇑
DoubleUpDownArrow ⇕
DoubleVerticalBar ∥
downarrow ↓
Downarrow ⇓
DownArrow ↓
DownArrowBar ⤓
DownArrowUpArrow ⇵
downdownarrows ⇊
downharpoonleft ⇃
downharpoonright ⇂
DownLeftRightVector ⥐
DownLeftTeeVector ⥞
DownLeftVector ↽
DownLeftVectorBar ⥖
DownRightTeeVector ⥟
DownRightVector ⇁
DownRightVectorBar ⥗
DownTee ⊤
DownTeeArrow ↧
drbkarow ⤐
drcorn ⌟
drcrop ⌌
dscr 𝒹
Dscr 𝒟
dscy ѕ
DScy Ѕ
dsol ⧶
dstrok đ
Dstrok Đ
dtdot ⋱
dtri ▿
dtrif ▾
duarr ⇵
duhar ⥯
dwangle ⦦
dzcy џ
DZcy Џ
dzigrarr ⟿
eacute é
Eacute É
easter ⩮
ecaron ě
Ecaron Ě
ecir ≖
ecirc ê
Ecirc Ê
ecolon ≕
ecy э
Ecy Э
eDDot ⩷
eDot ≑
edot ė
Edot Ė
ee ⅇ
efDot ≒
efr 𝔢
Efr 𝔈
eg ⪚
egrave è
Egrave È
egs ⪖
egsdot ⪘
el ⪙
Element ∈
elinters ⏧
ell ℓ
els ⪕
elsdot ⪗
emacr ē
Emacr Ē
empty ∅
emptyset ∅
EmptySmallSquare ◻
emptyv ∅
EmptyVerySmallSquare ▫
eng ŋ
ENG Ŋ
eogon ę
Eogon Ę
eopf 𝕖
Eopf 𝔼
epar ⋕
eparsl ⧣
eplus ⩱
epsi ϵ
epsilon ε
Epsilon Ε
epsiv ε
eqcirc ≖
eqcolon ≕
eqsim ≂
eqslantgtr ⪖
eqslantless ⪕
Equal ⩵
EqualTilde ≂
equest ≟
Equilibrium ⇌
equiv ≡
equivDD ⩸
eqvparsl ⧥
erarr ⥱
erDot ≓
escr ℯ
Escr ℰ
esdot ≐
esim ≂
Esim ⩳
eta η
Eta Η
eth ð
ETH Ð
euml ë
Euml Ë
euro €
excl !
exist ∃
Exists ∃
expectation ℰ
exponentiale ⅇ
ExponentialE ⅇ
fallingdotseq ≒
fcy ф
Fcy Ф
female ♀
ffilig ffi
fflig ff
ffllig ffl
ffr 𝔣
Ffr 𝔉
filig fi
FilledSmallSquare ◼
FilledVerySmallSquare ▪
flat ♭
fllig fl
fltns ▱
fnof ƒ
fopf 𝕗
Fopf 𝔽
forall ∀
ForAll ∀
fork ⋔
forkv ⫙
Fouriertrf ℱ
fpartint ⨍
frac12 ½
frac13 ⅓
frac14 ¼
frac15 ⅕
frac16 ⅙
frac18 ⅛
frac23 ⅔
frac25 ⅖
frac34 ¾
frac35 ⅗
frac38 ⅜
frac45 ⅘
frac56 ⅚
frac58 ⅝
frac78 ⅞
frasl ⁄
frown ⌢
fscr 𝒻
Fscr ℱ
gacute ǵ
gammad ϝ
Gammad Ϝ
gamma γ
Gamma Γ
gap ⪆
gbreve ğ
Gbreve Ğ
Gcedil Ģ
gcirc ĝ
Gcirc Ĝ
gcy г
Gcy Г
gdot ġ
Gdot Ġ
ge ≥
gE ≧
gel ⋛
gEl ⪌
geq ≥
geqq ≧
geqslant ⩾
ges ⩾
gescc ⪩
gesdot ⪀
gesdoto ⪂
gesdotol ⪄
gesles ⪔
gfr 𝔤
Gfr 𝔊
gg ≫
Gg ⋙
ggg ⋙
gimel ℷ
gjcy ѓ
GJcy Ѓ
gl ≷
gla ⪥
glE ⪒
glj ⪤
gnap ⪊
gnapprox ⪊
gne ⪈
gnE ≩
gneq ⪈
gneqq ≩
gnsim ⋧
gopf 𝕘
Gopf 𝔾
grave `
GreaterEqual ≥
GreaterEqualLess ⋛
GreaterFullEqual ≧
GreaterGreater ⪢
GreaterLess ≷
GreaterSlantEqual ⩾
GreaterTilde ≳
gscr ℊ
Gscr 𝒢
gsim ≳
gsime ⪎
gsiml ⪐
Gt ≫
gtcc ⪧
gtcir ⩺
gtdot ⋗
gt >
gtlPar ⦕
gtquest ⩼
gtrapprox ⪆
gtrarr ⥸
gtrdot ⋗
gtreqless ⋛
gtreqqless ⪌
gtrless ≷
gtrsim ≳
Hacek ˇ
half ½
hamilt ℋ
hardcy ъ
HARDcy Ъ
harr ↔
hArr ⇔
harrcir ⥈
harrw ↭
Hat ^
hbar ℏ
hcirc ĥ
Hcirc Ĥ
hearts ♥
heartsuit ♥
hellip …
hercon ⊹
hfr 𝔥
Hfr ℌ
HilbertSpace ℋ
hksearow ⤥
hkswarow ⤦
hoarr ⇿
homtht ∻
hookleftarrow ↩
hookrightarrow ↪
hopf 𝕙
Hopf ℍ
horbar ―
HorizontalLine ─
hscr 𝒽
Hscr ℋ
hslash ℏ
hstrok ħ
Hstrok Ħ
HumpDownHump ≎
HumpEqual ≏
hybull ⁃
hyphen ‐
iacute í
Iacute Í
icirc î
Icirc Î
icy и
Icy И
Idot İ
iecy е
IEcy Е
iexcl ¡
iff ⇔
ifr 𝔦
Ifr ℑ
igrave ì
Igrave Ì
ii ⅈ
iiiint ⨌
iiint ∭
iinfin ⧜
iiota ℩
ijlig ij
IJlig IJ
Im ℑ
imacr ī
Imacr Ī
image ℑ
ImaginaryI ⅈ
imagline ℐ
imagpart ℑ
imath ı
imof ⊷
imped Ƶ
Implies ⇒
in ∈
incare ℅
infin ∞
infintie ⧝
inodot ı
int ∫
Int ∬
intcal ⊺
integers ℤ
Integral ∫
intercal ⊺
Intersection ⋂
intlarhk ⨗
intprod ⨼
iocy ё
IOcy Ё
iogon į
Iogon Į
iopf 𝕚
Iopf 𝕀
iota ι
Iota Ι
iprod ⨼
iquest ¿
iscr 𝒾
Iscr ℐ
isin ∈
isindot ⋵
isinE ⋹
isins ⋴
isinsv ⋳
isinv ∈
itilde ĩ
Itilde Ĩ
iukcy і
Iukcy І
iuml ï
Iuml Ï
jcirc ĵ
Jcirc Ĵ
jcy й
Jcy Й
jfr 𝔧
Jfr 𝔍
jmath ȷ
jopf 𝕛
Jopf 𝕁
jscr 𝒿
Jscr 𝒥
jsercy ј
Jsercy Ј
jukcy є
Jukcy Є
kappav ϰ
kappa κ
Kappa Κ
kcedil ķ
Kcedil Ķ
kcy к
Kcy К
kfr 𝔨
Kfr 𝔎
kgreen ĸ
khcy х
KHcy Х
kjcy ќ
KJcy Ќ
kopf 𝕜
Kopf 𝕂
kscr 𝓀
Kscr 𝒦
lAarr ⇚
lacute ĺ
Lacute Ĺ
laemptyv ⦴
lagran ℒ
lambda λ
Lambda Λ
lang ⟨
Lang ⟪
langd ⦑
langle ⟨
lap ⪅
Laplacetrf ℒ
laquo «
larr ←
lArr ⇐
Larr ↞
larrb ⇤
larrbfs ⤟
larrfs ⤝
larrhk ↩
larrlp ↫
larrpl ⤹
larrsim ⥳
larrtl ↢
lat ⪫
latail ⤙
lAtail ⤛
late ⪭
lbarr ⤌
lBarr ⤎
lbbrk ❲
lbrace {
lbrack [
lbrke ⦋
lbrksld ⦏
lbrkslu ⦍
lcaron ľ
Lcaron Ľ
lcedil ļ
Lcedil Ļ
lceil ⌈
lcub {
lcy л
Lcy Л
ldca ⤶
ldquo “
ldquor „
ldrdhar ⥧
ldrushar ⥋
ldsh ↲
le ≤
lE ≦
LeftAngleBracket ⟨
leftarrow ←
Leftarrow ⇐
LeftArrow ←
LeftArrowBar ⇤
LeftArrowRightArrow ⇆
leftarrowtail ↢
LeftCeiling ⌈
LeftDoubleBracket ⟦
LeftDownTeeVector ⥡
LeftDownVector ⇃
LeftDownVectorBar ⥙
LeftFloor ⌊
leftharpoondown ↽
leftharpoonup ↼
leftleftarrows ⇇
leftrightarrow ↔
Leftrightarrow ⇔
LeftRightArrow ↔
leftrightarrows ⇆
leftrightharpoons ⇋
leftrightsquigarrow ↭
LeftRightVector ⥎
LeftTee ⊣
LeftTeeArrow ↤
LeftTeeVector ⥚
leftthreetimes ⋋
LeftTriangle ⊲
LeftTriangleBar ⧏
LeftTriangleEqual ⊴
LeftUpDownVector ⥑
LeftUpTeeVector ⥠
LeftUpVector ↿
LeftUpVectorBar ⥘
LeftVector ↼
LeftVectorBar ⥒
leg ⋚
lEg ⪋
leq ≤
leqq ≦
leqslant ⩽
les ⩽
lescc ⪨
lesdot ⩿
lesdoto ⪁
lesdotor ⪃
lesges ⪓
lessapprox ⪅
lessdot ⋖
lesseqgtr ⋚
lesseqqgtr ⪋
LessEqualGreater ⋚
LessFullEqual ≦
LessGreater ≶
lessgtr ≶
LessLess ⪡
lesssim ≲
LessSlantEqual ⩽
LessTilde ≲
lfisht ⥼
lfloor ⌊
lfr 𝔩
Lfr 𝔏
lg ≶
lgE ⪑
lHar ⥢
lhard ↽
lharu ↼
lharul ⥪
lhblk ▄
ljcy љ
LJcy Љ
ll ≪
Ll ⋘
llarr ⇇
llcorner ⌞
Lleftarrow ⇚
llhard ⥫
lltri ◺
lmidot ŀ
Lmidot Ŀ
lmoust ⎰
lmoustache ⎰
lnap ⪉
lnapprox ⪉
lne ⪇
lnE ≨
lneq ⪇
lneqq ≨
lnsim ⋦
loang ⟬
loarr ⇽
lobrk ⟦
longleftarrow ⟵
Longleftarrow ⟸
LongLeftArrow ⟵
longleftrightarrow ⟷
Longleftrightarrow ⟺
LongLeftRightArrow ⟷
longmapsto ⟼
longrightarrow ⟶
Longrightarrow ⟹
LongRightArrow ⟶
looparrowleft ↫
looparrowright ↬
lopar ⦅
lopf 𝕝
Lopf 𝕃
loplus ⨭
lotimes ⨴
lowast ∗
lowbar _
LowerLeftArrow ↙
LowerRightArrow ↘
loz ◊
lozenge ◊
lozf ⧫
lpar (
lparlt ⦓
lrarr ⇆
lrcorner ⌟
lrhar ⇋
lrhard ⥭
lrm
lrtri ⊿
lsaquo ‹
lscr 𝓁
Lscr ℒ
lsh ↰
Lsh ↰
lsim ≲
lsime ⪍
lsimg ⪏
lsqb [
lsquo ‘
lsquor ‚
lstrok ł
Lstrok Ł
Lt ≪
ltcc ⪦
ltcir ⩹
ltdot ⋖
lthree ⋋
ltimes ⋉
ltlarr ⥶
lt <
ltquest ⩻
ltri ◃
ltrie ⊴
ltrif ◂
ltrPar ⦖
lurdshar ⥊
luruhar ⥦
macr ¯
male ♂
malt ✠
maltese ✠
map ↦
Map ⤅
mapsto ↦
mapstodown ↧
mapstoleft ↤
mapstoup ↥
marker ▮
mcomma ⨩
mcy м
Mcy М
mdash —
mDDot ∺
measuredangle ∡
Mellintrf ℳ
mfr 𝔪
Mfr 𝔐
mho ℧
micro µ
mid ∣
midast *
midcir ⫰
middot ·
minus -
minusb ⊟
minusd ∸
minusdu ⨪
MinusPlus ∓
mlcp ⫛
mldr …
mnplus ∓
models ⊧
mopf 𝕞
Mopf 𝕄
mp ∓
mscr 𝓂
Mscr ℳ
mstpos ∾
multimap ⊸
mumap ⊸
mu μ
Mu Μ
nabla ∇
nacute ń
Nacute Ń
nap ≉
napos ʼn
napprox ≉
natur ♮
natural ♮
naturals ℕ
nbsp _SPACE_
ncap ⩃
ncaron ň
Ncaron Ň
ncedil ņ
Ncedil Ņ
ncong ≇
ncup ⩂
ncy н
Ncy Н
ndash –
ne ≠
nearhk ⤤
nearr ↗
neArr ⇗
nearrow ↗
nequiv ≢
nesear ⤨
NestedGreaterGreater ≫
NestedLessLess ≪
nexist ∄
nexists ∄
nfr 𝔫
Nfr 𝔑
nge ≱
ngeq ≱
ngsim ≵
ngt ≯
ngtr ≯
nharr ↮
nhArr ⇎
nhpar ⫲
ni ∋
nis ⋼
nisd ⋺
niv ∋
njcy њ
NJcy Њ
nlarr ↚
nlArr ⇍
nldr ‥
nle ≰
nleftarrow ↚
nLeftarrow ⇍
nleftrightarrow ↮
nLeftrightarrow ⇎
nleq ≰
nless ≮
nlsim ≴
nlt ≮
nltri ⋪
nltrie ⋬
nmid ∤
nopf 𝕟
Nopf ℕ
not ¬
Not ⫬
NotCongruent ≢
NotCupCap ≭
NotDoubleVerticalBar ∦
NotElement ∉
NotEqual ≠
NotExists ∄
NotGreater ≯
NotGreaterEqual ≱
NotGreaterLess ≹
NotGreaterTilde ≵
notin ∉
notinva ∉
notinvb ⋷
notinvc ⋶
NotLeftTriangle ⋪
NotLeftTriangleEqual ⋬
NotLess ≮
NotLessEqual ≰
NotLessGreater ≸
NotLessTilde ≴
notni ∌
notniva ∌
notnivb ⋾
notnivc ⋽
NotPrecedes ⊀
NotPrecedesSlantEqual ⋠
NotReverseElement ∌
NotRightTriangle ⋫
NotRightTriangleEqual ⋭
NotSquareSubsetEqual ⋢
NotSquareSupersetEqual ⋣
NotSubsetEqual ⊈
NotSucceeds ⊁
NotSucceedsSlantEqual ⋡
NotSupersetEqual ⊉
NotTilde ≁
NotTildeEqual ≄
NotTildeFullEqual ≇
NotTildeTilde ≉
NotVerticalBar ∤
npar ∦
nparallel ∦
npolint ⨔
npr ⊀
nprcue ⋠
nprec ⊀
nrarr ↛
nrArr ⇏
nrightarrow ↛
nRightarrow ⇏
nrtri ⋫
nrtrie ⋭
nsc ⊁
nsccue ⋡
nscr 𝓃
Nscr 𝒩
nshortmid ∤
nshortparallel ∦
nsim ≁
nsime ≄
nsimeq ≄
nsmid ∤
nspar ∦
nsqsube ⋢
nsqsupe ⋣
nsub ⊄
nsube ⊈
nsubseteq ⊈
nsucc ⊁
nsup ⊅
nsupe ⊉
nsupseteq ⊉
ntgl ≹
ntilde ñ
Ntilde Ñ
ntlg ≸
ntriangleleft ⋪
ntrianglelefteq ⋬
ntriangleright ⋫
ntrianglerighteq ⋭
num #
numero №
nu ν
Nu Ν
nvdash ⊬
nvDash ⊭
nVdash ⊮
nVDash ⊯
nvHarr ⤄
nvinfin ⧞
nvlArr ⤂
nvrArr ⤃
nwarhk ⤣
nwarr ↖
nwArr ⇖
nwarrow ↖
nwnear ⤧
oacute ó
Oacute Ó
oast ⊛
ocir ⊚
ocirc ô
Ocirc Ô
ocy о
Ocy О
odash ⊝
odblac ő
Odblac Ő
odiv ⨸
odot ⊙
odsold ⦼
oelig œ
OElig Œ
ofcir ⦿
ofr 𝔬
Ofr 𝔒
ogon ˛
ograve ò
Ograve Ò
ogt ⧁
ohbar ⦵
ohm Ω
oint ∮
olarr ↺
olcir ⦾
olcross ⦻
oline ‾
olt ⧀
omacr ō
Omacr Ō
omega ω
Omega Ω
omicron ο
Omicron Ο
omid ⦶
ominus ⊖
oopf 𝕠
Oopf 𝕆
opar ⦷
OpenCurlyDoubleQuote “
OpenCurlyQuote ‘
operp ⦹
oplus ⊕
or ∨
Or ⩔
orarr ↻
ord ⩝
order ℴ
orderof ℴ
ordf ª
ordm º
origof ⊶
oror ⩖
orslope ⩗
orv ⩛
oS Ⓢ
oscr ℴ
Oscr 𝒪
oslash ø
Oslash Ø
osol ⊘
otilde õ
Otilde Õ
otimes ⊗
Otimes ⨷
otimesas ⨶
ouml ö
Ouml Ö
ovbar ⌽
OverBar ¯
OverBrace ⏞
OverBracket ⎴
OverParenthesis ⏜
par ∥
para ¶
parallel ∥
parsim ⫳
parsl ⫽
part ∂
PartialD ∂
pcy п
Pcy П
percnt %
period .
permil ‰
perp ⊥
pertenk ‱
pfr 𝔭
Pfr 𝔓
phiv φ
phi φ
Phi Φ
phmmat ℳ
phone ☎
pitchfork ⋔
piv ϖ
pi π
Pi Π
planck ℏ
planckh ℎ
plankv ℏ
plus +
plusacir ⨣
plusb ⊞
pluscir ⨢
plusdo ∔
plusdu ⨥
pluse ⩲
PlusMinus ±
plusmn ±
plussim ⨦
plustwo ⨧
pm ±
Poincareplane ℌ
pointint ⨕
popf 𝕡
Popf ℙ
pound £
pr ≺
Pr ⪻
prap ⪷
prcue ≼
pre ⪯
prE ⪳
prec ≺
precapprox ⪷
preccurlyeq ≼
Precedes ≺
PrecedesEqual ⪯
PrecedesSlantEqual ≼
PrecedesTilde ≾
preceq ⪯
precnapprox ⪹
precneqq ⪵
precnsim ⋨
precsim ≾
prime ′
Prime ″
primes ℙ
prnap ⪹
prnE ⪵
prnsim ⋨
prod ∏
Product ∏
profalar ⌮
profline ⌒
profsurf ⌓
prop ∝
Proportion ∷
Proportional ∝
propto ∝
prsim ≾
prurel ⊰
pscr 𝓅
Pscr 𝒫
psi ψ
Psi Ψ
qfr 𝔮
Qfr 𝔔
qint ⨌
qopf 𝕢
Qopf ℚ
qprime ⁗
qscr 𝓆
Qscr 𝒬
quaternions ℍ
quatint ⨖
quest ?
questeq ≟
quot "
QUOT "
rAarr ⇛
race ⧚
racute ŕ
Racute Ŕ
radic √
raemptyv ⦳
rang ⟩
Rang ⟫
rangd ⦒
range ⦥
rangle ⟩
raquo »
rarr →
rArr ⇒
Rarr ↠
rarrap ⥵
rarrb ⇥
rarrbfs ⤠
rarrc ⤳
rarrfs ⤞
rarrhk ↪
rarrlp ↬
rarrpl ⥅
rarrsim ⥴
rarrtl ↣
Rarrtl ⤖
rarrw ↝
ratail ⤚
rAtail ⤜
ratio ∶
rationals ℚ
rbarr ⤍
rBarr ⤏
RBarr ⤐
rbbrk ❳
rbrace }
rbrack ]
rbrke ⦌
rbrksld ⦎
rbrkslu ⦐
rcaron ř
Rcaron Ř
rcedil ŗ
Rcedil Ŗ
rceil ⌉
rcub }
rcy р
Rcy Р
rdca ⤷
rdldhar ⥩
rdquo ”
rdquor ”
rdsh ↳
Re ℜ
real ℜ
realine ℛ
realpart ℜ
reals ℝ
rect ▭
reg ®
REG ®
ReverseElement ∋
ReverseEquilibrium ⇋
ReverseUpEquilibrium ⥯
rfisht ⥽
rfloor ⌋
rfr 𝔯
Rfr ℜ
rHar ⥤
rhard ⇁
rharu ⇀
rharul ⥬
rhov ϱ
rho ρ
Rho Ρ
RightAngleBracket ⟩
rightarrow →
Rightarrow ⇒
RightArrow →
RightArrowBar ⇥
RightArrowLeftArrow ⇄
rightarrowtail ↣
RightCeiling ⌉
RightDoubleBracket ⟧
RightDownTeeVector ⥝
RightDownVector ⇂
RightDownVectorBar ⥕
RightFloor ⌋
rightharpoondown ⇁
rightharpoonup ⇀
rightleftarrows ⇄
rightleftharpoons ⇌
rightrightarrows ⇉
rightsquigarrow ↝
RightTee ⊢
RightTeeArrow ↦
RightTeeVector ⥛
rightthreetimes ⋌
RightTriangle ⊳
RightTriangleBar ⧐
RightTriangleEqual ⊵
RightUpDownVector ⥏
RightUpTeeVector ⥜
RightUpVector ↾
RightUpVectorBar ⥔
RightVector ⇀
RightVectorBar ⥓
ring ˚
risingdotseq ≓
rlarr ⇄
rlhar ⇌
rlm
rmoust ⎱
rmoustache ⎱
rnmid ⫮
roang ⟭
roarr ⇾
robrk ⟧
ropar ⦆
ropf 𝕣
Ropf ℝ
roplus ⨮
rotimes ⨵
RoundImplies ⥰
rpar )
rpargt ⦔
rppolint ⨒
rrarr ⇉
Rrightarrow ⇛
rsaquo ›
rscr 𝓇
Rscr ℛ
rsh ↱
Rsh ↱
rsqb ]
rsquo ’
rsquor ’
rthree ⋌
rtimes ⋊
rtri ▹
rtrie ⊵
rtrif ▸
rtriltri ⧎
RuleDelayed ⧴
ruluhar ⥨
rx ℞
sacute ś
Sacute Ś
sbquo ‚
sc ≻
Sc ⪼
scap ⪸
scaron š
Scaron Š
sccue ≽
sce ⪰
scE ⪴
scedil ş
Scedil Ş
scirc ŝ
Scirc Ŝ
scnap ⪺
scnE ⪶
scnsim ⋩
scpolint ⨓
scsim ≿
scy с
Scy С
sdot ⋅
sdotb ⊡
sdote ⩦
searhk ⤥
searr ↘
seArr ⇘
searrow ↘
sect §
semi ;
seswar ⤩
setminus ∖
setmn ∖
sext ✶
sfr 𝔰
Sfr 𝔖
sfrown ⌢
sharp ♯
shchcy щ
SHCHcy Щ
shcy ш
SHcy Ш
ShortDownArrow ↓
ShortLeftArrow ←
shortmid ∣
shortparallel ∥
ShortRightArrow →
ShortUpArrow ↑
sigmaf ς
sigmav ς
sigma σ
Sigma Σ
sim ∼
simdot ⩪
sime ≃
simeq ≃
simg ⪞
simgE ⪠
siml ⪝
simlE ⪟
simne ≆
simplus ⨤
simrarr ⥲
slarr ←
SmallCircle ∘
smallsetminus ∖
smashp ⨳
smeparsl ⧤
smid ∣
smile ⌣
smt ⪪
smte ⪬
softcy ь
SOFTcy Ь
sol /
solb ⧄
solbar ⌿
sopf 𝕤
Sopf 𝕊
spades ♠
spadesuit ♠
spar ∥
sqcap ⊓
sqcup ⊔
Sqrt √
sqsub ⊏
sqsube ⊑
sqsubset ⊏
sqsubseteq ⊑
sqsup ⊐
sqsupe ⊒
sqsupset ⊐
sqsupseteq ⊒
squ □
square □
Square □
SquareIntersection ⊓
SquareSubset ⊏
SquareSubsetEqual ⊑
SquareSuperset ⊐
SquareSupersetEqual ⊒
SquareUnion ⊔
squarf ▪
squf ▪
srarr →
sscr 𝓈
Sscr 𝒮
ssetmn ∖
ssmile ⌣
sstarf ⋆
star ☆
Star ⋆
starf ★
straightepsilon ϵ
straightphi ϕ
strns ¯
sub ⊂
Sub ⋐
subdot ⪽
sube ⊆
subE ⫅
subedot ⫃
submult ⫁
subne ⊊
subnE ⫋
subplus ⪿
subrarr ⥹
subset ⊂
Subset ⋐
subseteq ⊆
subseteqq ⫅
SubsetEqual ⊆
subsetneq ⊊
subsetneqq ⫋
subsim ⫇
subsub ⫕
subsup ⫓
succ ≻
succapprox ⪸
succcurlyeq ≽
Succeeds ≻
SucceedsEqual ⪰
SucceedsSlantEqual ≽
SucceedsTilde ≿
succeq ⪰
succnapprox ⪺
succneqq ⪶
succnsim ⋩
succsim ≿
SuchThat ∋
sum ∑
Sum ∑
sung ♪
sup ⊃
Sup ⋑
sup1 ¹
sup2 ²
sup3 ³
supdot ⪾
supdsub ⫘
supe ⊇
supE ⫆
supedot ⫄
Superset ⊃
SupersetEqual ⊇
suphsub ⫗
suplarr ⥻
supmult ⫂
supne ⊋
supnE ⫌
supplus ⫀
supset ⊃
Supset ⋑
supseteq ⊇
supseteqq ⫆
supsetneq ⊋
supsetneqq ⫌
supsim ⫈
supsub ⫔
supsup ⫖
swarhk ⤦
swarr ↙
swArr ⇙
swarrow ↙
swnwar ⤪
szlig ß
target ⌖
tau τ
Tau Τ
tbrk ⎴
tcaron ť
Tcaron Ť
tcedil ţ
Tcedil Ţ
tcy т
Tcy Т
tdot =⃛
telrec ⌕
tfr 𝔱
Tfr 𝔗
there4 ∴
therefore ∴
Therefore ∴
thetasym ϑ
thetav ϑ
theta θ
Theta Θ
thickapprox ≈
thicksim ∼
thkap ≈
thksim ∼
thorn þ
THORN Þ
tilde ˜
Tilde ∼
TildeEqual ≃
TildeFullEqual ≅
TildeTilde ≈
times ×
timesb ⊠
timesbar ⨱
timesd ⨰
tint ∭
toea ⤨
top ⊤
topbot ⌶
topcir ⫱
topf 𝕥
Topf 𝕋
topfork ⫚
tosa ⤩
tprime ‴
trade ™
TRADE ™
triangle ▵
triangledown ▿
triangleleft ◃
trianglelefteq ⊴
triangleq ≜
triangleright ▹
trianglerighteq ⊵
tridot ◬
trie ≜
triminus ⨺
TripleDot =⃛
triplus ⨹
trisb ⧍
tritime ⨻
trpezium ⏢
tscr 𝓉
Tscr 𝒯
tscy ц
TScy Ц
tshcy ћ
TSHcy Ћ
tstrok ŧ
Tstrok Ŧ
twixt ≬
twoheadleftarrow ↞
twoheadrightarrow ↠
uacute ú
Uacute Ú
uarr ↑
uArr ⇑
Uarr ↟
Uarrocir ⥉
ubrcy ў
Ubrcy Ў
ubreve ŭ
Ubreve Ŭ
ucirc û
Ucirc Û
ucy у
Ucy У
udarr ⇅
udblac ű
Udblac Ű
udhar ⥮
ufisht ⥾
ufr 𝔲
Ufr 𝔘
ugrave ù
Ugrave Ù
uHar ⥣
uharl ↿
uharr ↾
uhblk ▀
ulcorn ⌜
ulcorner ⌜
ulcrop ⌏
ultri ◸
umacr ū
Umacr Ū
uml ¨
UnderBrace ⏟
UnderBracket ⎵
UnderParenthesis ⏝
Union ⋃
UnionPlus ⊎
uogon ų
Uogon Ų
uopf 𝕦
Uopf 𝕌
uparrow ↑
Uparrow ⇑
UpArrow ↑
UpArrowBar ⤒
UpArrowDownArrow ⇅
updownarrow ↕
Updownarrow ⇕
UpDownArrow ↕
UpEquilibrium ⥮
upharpoonleft ↿
upharpoonright ↾
uplus ⊎
UpperLeftArrow ↖
UpperRightArrow ↗
upsih ϒ
upsilon υ
Upsilon Υ
upsi υ
Upsi ϒ
UpTee ⊥
UpTeeArrow ↥
upuparrows ⇈
urcorn ⌝
urcorner ⌝
urcrop ⌎
uring ů
Uring Ů
urtri ◹
uscr 𝓊
Uscr 𝒰
utdot ⋰
utilde ũ
Utilde Ũ
utri ▵
utrif ▴
uuarr ⇈
uuml ü
Uuml Ü
uwangle ⦧
vangrt ⦜
varepsilon ε
varkappa ϰ
varnothing ∅
varphi φ
varpi ϖ
varpropto ∝
varr ↕
vArr ⇕
varrho ϱ
varsigma ς
vartheta ϑ
vartriangleleft ⊲
vartriangleright ⊳
vBar ⫨
Vbar ⫫
vBarv ⫩
vcy в
Vcy В
vdash ⊢
vDash ⊨
Vdash ⊩
VDash ⊫
Vdashl ⫦
vee ∨
Vee ⋁
veebar ⊻
veeeq ≚
vellip ⋮
verbar |
Verbar ‖
vert |
Vert ‖
VerticalBar ∣
VerticalLine |
VerticalSeparator ❘
VerticalTilde ≀
vfr 𝔳
Vfr 𝔙
vltri ⊲
vopf 𝕧
Vopf 𝕍
vprop ∝
vrtri ⊳
vscr 𝓋
Vscr 𝒱
Vvdash ⊪
vzigzag ⦚
wcirc ŵ
Wcirc Ŵ
wedbar ⩟
wedge ∧
Wedge ⋀
wedgeq ≙
weierp ℘
wfr 𝔴
Wfr 𝔚
wopf 𝕨
Wopf 𝕎
wp ℘
wr ≀
wreath ≀
wscr 𝓌
Wscr 𝒲
xcap ⋂
xcirc ◯
xcup ⋃
xdtri ▽
xfr 𝔵
Xfr 𝔛
xharr ⟷
xhArr ⟺
xi ξ
Xi Ξ
xlarr ⟵
xlArr ⟸
xmap ⟼
xnis ⋻
xodot ⨀
xopf 𝕩
Xopf 𝕏
xoplus ⨁
xotime ⨂
xrarr ⟶
xrArr ⟹
xscr 𝓍
Xscr 𝒳
xsqcup ⨆
xuplus ⨄
xutri △
xvee ⋁
xwedge ⋀
yacute ý
Yacute Ý
yacy я
YAcy Я
ycirc ŷ
Ycirc Ŷ
ycy ы
Ycy Ы
yen ¥
yfr 𝔶
Yfr 𝔜
yicy ї
YIcy Ї
yopf 𝕪
Yopf 𝕐
yscr 𝓎
Yscr 𝒴
yucy ю
YUcy Ю
yuml ÿ
Yuml Ÿ
zacute ź
Zacute Ź
zcaron ž
Zcaron Ž
zcy з
Zcy З
zdot ż
Zdot Ż
zeetrf ℨ
zeta ζ
Zeta Ζ
zfr 𝔷
Zfr ℨ
zhcy ж
ZHcy Ж
zigrarr ⇝
zopf 𝕫
Zopf ℤ
zscr 𝓏
Zscr 𝒵
zwj
zwnj
);
# Placeholder table for the spacing / invisible HTML5 entities.
# It is intentionally empty: the entity names listed below are still
# to be processed and have no mapping yet.
#
#   ensp  emsp  emsp13  emsp14  numsp  puncsp
#   thinsp  ThinSpace  hairsp  VeryThinSpace  ZeroWidthSpace
#   NegativeVeryThinSpace  NegativeThinSpace
#   NegativeMediumSpace  NegativeThickSpace
#   MediumSpace  NoBreak
#   ApplyFunction  af
#   InvisibleTimes  it
#   InvisibleComma  ic
my %spac = ();
#
# TODO
# - get rid of any stray < or > characters...
#
# Command-line switches.  The script runs under "perl -s" (see the shebang),
# so a switch "-foo" or "-foo=value" arrives as the package variable $foo.
# They are declared with "our" so the names below refer to those variables.
# NOTE(review): "$textouput" is spelled this way on purpose here; renaming it
# would rename the corresponding -textouput switch.
our ($name, $listofpairs, $comm, $noimg, $tag, $nolatin1, $breakby,
     $nosentbreak, $getalt,
     $textouput, $latin1output, $isutf8,
     $breakbyemptyline,
     $breakbynl, $inlinetags,
     $txt, $txtll, $indentedpar);

# Tags whose opening or closing markup acts as a segment separator
# (they are the alternatives of $patsep below).
my @breakby = qw( table tr td th p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd
                  div blockquote hr address center form input noscript label
                  thead tbody caption optgroup legend pre textarea option
                  fieldset article main ); #select

# Tags matched individually (opening or closing tag only, not the content
# between them) by $patremovtag below.
my @removtag = qw(col body html em font a tt small strong area map
                  span iframe abbr big dir link select);

# Inline formatting tags; treated like @removtag unless -inlinetags is given.
my @inlinetag = qw(sup sub b i u);

# Elements matched as a whole, from the opening tag through the matching
# closing tag, by $patremov below.
my @remov = qw(object marquee frameset head meta script map area style svg
               button nav);

# Extra, non-tag separator alternative appended inside $patsep (empty = none).
my $rawbreakby = "";

# I/O layer for output handles.  ":iso-8859-1" is not a PerlIO layer name
# (binmode rejects it), so Latin-1 output uses the :encoding() layer.
my $o = ":utf8";
$o = ":encoding(iso-8859-1)" if $latin1output;

$tag ||= "p";                                   # default segment tag
push @removtag, "img"      if $noimg;
push @removtag, @inlinetag unless $inlinetags;
push @breakby,  $breakby   if $breakby;

# Each value starts with "|" because it is spliced in as one more alternative
# of the separator group.  Later assignments override earlier ones, so the
# precedence is: -breakbynl/-txtll, then -indentedpar, then
# -breakbyemptyline/-txt.
$rawbreakby = '|\n[ \t]*\n'                     if $breakbyemptyline || $txt;
$rawbreakby = '|\n(?: {4,}|\t| *\n)(?:\s*)'     if $indentedpar;
$rawbreakby = '|[ \t]*\n'                       if $breakbynl || $txtll;

# One opening or closing tag from @removtag, including its attributes.
# The leading "</?" is required: without it the pattern would begin with a
# bare "?" quantifier and fail to compile.
my $patremovtag = q{</?(?:}
                . join('|', @removtag)
                . q{)\b(?:=\"[^"]{1,80}\"|=\'[^']{1,80}\'|[^>])*>};

# A whole element from @remov: opening tag, any content (newlines included,
# non-greedy) and the closing tag of the same name ("</\1>").
my $patremov = '<(' . join('|', @remov) . ')\b[^>]*>(.|\n)*?</\1>';

# A run of one or more separators: @breakby tags (opening or closing) and,
# when configured, the raw-text separator in $rawbreakby.
my $patsep = '\s*(?:</?(?:'
           . join('|', @breakby)
           . ')\b[^>]*>\s*'
           . $rawbreakby
           . ')+';
if ($listofpairs) {
my $corpus1 = "$name.A.out";
my $corpus2 = "$name.B.out";
if (scalar(@ARGV) == 3) {
$corpus2 = pop @ARGV;
$corpus1 = pop @ARGV;
}
open A, ">$corpus1" or die "Error creating file [$corpus1]: $!";
open B, ">$corpus2" or die "Error creating file [$corpus2]: $!";
binmode A, $o;
binmode B, $o;
while (<>) {
my ($a,$b) = m!(.*?)\t(.*)! or die("invalid lines");
print STDERR "($a)($b)\n";
next if ($a =~ /\.pdf$/ or $b =~ /\.pdf$/ );
$id++;
print A " id='$id' name='$a' 111 222 333 444 555 666 777 888 999 id='$id' name='$b' 111 222 333 444 555 666 777 888 999
)\s*((?:$pinlinetag)>)!$2$1!g or $text =~ s!($pinlinetag)>\s*<\1>! !g or $text =~ s!<($pinlinetag)>\s*\1>! !g) {} $text =~ s!\h{2,}! !g; $text =~ s!
\s*
\n!!g; print STDERR $text; return $text; } sub mind_the_sync_tag{ my $text= shift; if($text =~ s!(<$tag>)(
=head1 SYNOPSIS
html2pml [-tag=...] [options] file
html2pml -listofpairs [-tag=...] [options] file
=head1 DESCRIPTION
C<html2pml> converts HTML into PML (the "P" markup language - only use tags P)
with the independent segments, after dividing them in sentences.
It was designed to help in the process of aligning texts.
The command C )
C<-noimg> - remove IMG tags (default keep them)
C<-nosentbreak> - don't use xmlsentences to break inside paragraphs
C<-breakby=tag> - use C