#!/usr/bin/perl -s use Encode; use Lingua::PT::PLNbase; use utf8::all; my %ent = ### html5 entities qw( aacute á Aacute Á abreve ă Abreve Ă ac ∾ acd ∿ acirc â Acirc  acute ´ acy а Acy А aelig æ AElig Æ afr 𝔞 Afr 𝔄 agrave à Agrave À alefsym ℵ aleph ℵ alpha α Alpha Α amacr ā Amacr Ā amalg ⨿ amp & and ∧ And ⩓ andand ⩕ andd ⩜ andslope ⩘ andv ⩚ ang ∠ ange ⦤ angle ∠ angmsd ∡ angmsdaa ⦨ angmsdab ⦩ angmsdac ⦪ angmsdad ⦫ angmsdae ⦬ angmsdaf ⦭ angmsdag ⦮ angmsdah ⦯ angrt ∟ angrtvb ⊾ angrtvbd ⦝ angsph ∢ angst Å angzarr ⍼ aogon ą Aogon Ą aopf 𝕒 Aopf 𝔸 ap ≈ apacir ⩯ ape ≊ apE ⩰ apid ≋ apos ' approx ≈ approxeq ≊ aring å Aring Å ascr 𝒶 Ascr 𝒜 Assign ≔ ast * asymp ≈ asympeq ≍ atilde ã Atilde à auml ä Auml Ä awconint ∳ awint ⨑ backcong ≌ backepsilon ϶ backprime ‵ backsim ∽ backsimeq ⋍ Backslash ∖ Barv ⫧ barvee ⊽ barwed ⌅ Barwed ⌆ barwedge ⌅ bbrk ⎵ bbrktbrk ⎶ bcong ≌ bcy б Bcy Б bdquo „ becaus ∵ because ∵ Because ∵ bemptyv ⦰ bepsi ϶ bernou ℬ Bernoullis ℬ beta β Beta Β beth ℶ between ≬ bfr 𝔟 Bfr 𝔅 bigcap ⋂ bigcirc ◯ bigcup ⋃ bigodot ⨀ bigoplus ⨁ bigotimes ⨂ bigsqcup ⨆ bigstar ★ bigtriangledown ▽ bigtriangleup △ biguplus ⨄ bigvee ⋁ bigwedge ⋀ bkarow ⤍ blacklozenge ⧫ blacksquare ▪ blacktriangle ▴ blacktriangledown ▾ blacktriangleleft ◂ blacktriangleright ▸ blank ␣ blk12 ▒ blk14 ░ blk34 ▓ block █ bnot ⌐ bNot ⫭ bopf 𝕓 Bopf 𝔹 bot ⊥ bottom ⊥ bowtie ⋈ boxbox ⧉ boxdl ┐ boxdL ╕ boxDl ╖ boxDL ╗ boxdr ┌ boxdR ╒ boxDr ╓ boxDR ╔ boxh ─ boxH ═ boxhd ┬ boxhD ╥ boxHd ╤ boxHD ╦ boxhu ┴ boxhU ╨ boxHu ╧ boxHU ╩ boxminus ⊟ boxplus ⊞ boxtimes ⊠ boxul ┘ boxuL ╛ boxUl ╜ boxUL ╝ boxur └ boxuR ╘ boxUr ╙ boxUR ╚ boxv │ boxV ║ boxvh ┼ boxvH ╪ boxVh ╫ boxVH ╬ boxvl ┤ boxvL ╡ boxVl ╢ boxVL ╣ boxvr ├ boxvR ╞ boxVr ╟ boxVR ╠ bprime ‵ breve ˘ Breve ˘ brvbar ¦ bscr 𝒷 Bscr ℬ bsemi ⁏ bsim ∽ bsime ⋍ bsol \ bsolb ⧅ bull • bullet • bump ≎ bumpe ≏ bumpE ⪮ bumpeq ≏ Bumpeq ≎ cacute ć Cacute Ć cap ∩ Cap ⋒ capand ⩄ capbrcup ⩉ capcap ⩋ capcup ⩇ capdot ⩀ CapitalDifferentialD ⅅ caret ⁁ caron ˇ Cayleys ℭ ccaps ⩍ ccaron č Ccaron Č 
ccedil ç Ccedil Ç ccirc ĉ Ccirc Ĉ Cconint ∰ ccups ⩌ ccupssm ⩐ cdot ċ Cdot Ċ cedil ¸ Cedilla ¸ cemptyv ⦲ cent ¢ centerdot · CenterDot · cfr 𝔠 Cfr ℭ chcy ч CHcy Ч check ✓ checkmark ✓ chi χ Chi Χ cir ○ circ ˆ circeq ≗ circlearrowleft ↺ circlearrowright ↻ circledast ⊛ circledcirc ⊚ circleddash ⊝ CircleDot ⊙ circledR ® circledS Ⓢ CircleMinus ⊖ CirclePlus ⊕ CircleTimes ⊗ cire ≗ cirE ⧃ cirfnint ⨐ cirmid ⫯ cirscir ⧂ ClockwiseContourIntegral ∲ CloseCurlyDoubleQuote ” CloseCurlyQuote ’ clubs ♣ clubsuit ♣ colon : Colon ∷ colone ≔ Colone ⩴ coloneq ≔ comma , commat @ comp ∁ compfn ∘ complement ∁ complexes ℂ cong ≅ congdot ⩭ Congruent ≡ conint ∮ Conint ∯ ContourIntegral ∮ copf 𝕔 Copf ℂ coprod ∐ Coproduct ∐ copy © COPY © copysr ℗ CounterClockwiseContourIntegral ∳ crarr ↵ cross ✗ Cross ⨯ cscr 𝒸 Cscr 𝒞 csub ⫏ csube ⫑ csup ⫐ csupe ⫒ ctdot ⋯ cudarrl ⤸ cudarrr ⤵ cuepr ⋞ cuesc ⋟ cularr ↶ cularrp ⤽ cup ∪ Cup ⋓ cupbrcap ⩈ cupcap ⩆ CupCap ≍ cupcup ⩊ cupdot ⊍ cupor ⩅ curarr ↷ curarrm ⤼ curlyeqprec ⋞ curlyeqsucc ⋟ curlyvee ⋎ curlywedge ⋏ curren ¤ curvearrowleft ↶ curvearrowright ↷ cuvee ⋎ cuwed ⋏ cwconint ∲ cwint ∱ cylcty ⌭ dagger † Dagger ‡ daleth ℸ darr ↓ dArr ⇓ Darr ↡ dash ‐ dashv ⊣ Dashv ⫤ dbkarow ⤏ dblac ˝ dcaron ď Dcaron Ď dcy д Dcy Д dd ⅆ DD ⅅ ddagger ‡ ddarr ⇊ DDotrahd ⤑ ddotseq ⩷ deg ° Del ∇ delta δ Delta Δ demptyv ⦱ dfisht ⥿ dfr 𝔡 Dfr 𝔇 dHar ⥥ dharl ⇃ dharr ⇂ DiacriticalAcute ´ DiacriticalDot ˙ DiacriticalDoubleAcute ˝ DiacriticalGrave ` DiacriticalTilde ˜ diam ⋄ diamond ⋄ Diamond ⋄ diamondsuit ♦ diams ♦ die ¨ DifferentialD ⅆ digamma ϝ disin ⋲ div ÷ divide ÷ divideontimes ⋇ divonx ⋇ djcy ђ DJcy Ђ dlcorn ⌞ dlcrop ⌍ dollar $ dopf 𝕕 Dopf 𝔻 dot ˙ Dot ¨ doteq ≐ doteqdot ≑ DotEqual ≐ dotminus ∸ dotplus ∔ dotsquare ⊡ doublebarwedge ⌆ DoubleContourIntegral ∯ DoubleDot ¨ DoubleDownArrow ⇓ DoubleLeftArrow ⇐ DoubleLeftRightArrow ⇔ DoubleLeftTee ⫤ DoubleLongLeftArrow ⟸ DoubleLongLeftRightArrow ⟺ DoubleLongRightArrow ⟹ DoubleRightArrow ⇒ DoubleRightTee ⊨ DoubleUpArrow ⇑ DoubleUpDownArrow ⇕ 
DoubleVerticalBar ∥ downarrow ↓ Downarrow ⇓ DownArrow ↓ DownArrowBar ⤓ DownArrowUpArrow ⇵ downdownarrows ⇊ downharpoonleft ⇃ downharpoonright ⇂ DownLeftRightVector ⥐ DownLeftTeeVector ⥞ DownLeftVector ↽ DownLeftVectorBar ⥖ DownRightTeeVector ⥟ DownRightVector ⇁ DownRightVectorBar ⥗ DownTee ⊤ DownTeeArrow ↧ drbkarow ⤐ drcorn ⌟ drcrop ⌌ dscr 𝒹 Dscr 𝒟 dscy ѕ DScy Ѕ dsol ⧶ dstrok đ Dstrok Đ dtdot ⋱ dtri ▿ dtrif ▾ duarr ⇵ duhar ⥯ dwangle ⦦ dzcy џ DZcy Џ dzigrarr ⟿ eacute é Eacute É easter ⩮ ecaron ě Ecaron Ě ecir ≖ ecirc ê Ecirc Ê ecolon ≕ ecy э Ecy Э eDDot ⩷ eDot ≑ edot ė Edot Ė ee ⅇ efDot ≒ efr 𝔢 Efr 𝔈 eg ⪚ egrave è Egrave È egs ⪖ egsdot ⪘ el ⪙ Element ∈ elinters ⏧ ell ℓ els ⪕ elsdot ⪗ emacr ē Emacr Ē empty ∅ emptyset ∅ EmptySmallSquare ◻ emptyv ∅ EmptyVerySmallSquare ▫ eng ŋ ENG Ŋ eogon ę Eogon Ę eopf 𝕖 Eopf 𝔼 epar ⋕ eparsl ⧣ eplus ⩱ epsi ϵ epsilon ε Epsilon Ε epsiv ε eqcirc ≖ eqcolon ≕ eqsim ≂ eqslantgtr ⪖ eqslantless ⪕ Equal ⩵ EqualTilde ≂ equest ≟ Equilibrium ⇌ equiv ≡ equivDD ⩸ eqvparsl ⧥ erarr ⥱ erDot ≓ escr ℯ Escr ℰ esdot ≐ esim ≂ Esim ⩳ eta η Eta Η eth ð ETH Ð euml ë Euml Ë euro € excl ! 
exist ∃ Exists ∃ expectation ℰ exponentiale ⅇ ExponentialE ⅇ fallingdotseq ≒ fcy ф Fcy Ф female ♀ ffilig ffi fflig ff ffllig ffl ffr 𝔣 Ffr 𝔉 filig fi FilledSmallSquare ◼ FilledVerySmallSquare ▪ flat ♭ fllig fl fltns ▱ fnof ƒ fopf 𝕗 Fopf 𝔽 forall ∀ ForAll ∀ fork ⋔ forkv ⫙ Fouriertrf ℱ fpartint ⨍ frac12 ½ frac13 ⅓ frac14 ¼ frac15 ⅕ frac16 ⅙ frac18 ⅛ frac23 ⅔ frac25 ⅖ frac34 ¾ frac35 ⅗ frac38 ⅜ frac45 ⅘ frac56 ⅚ frac58 ⅝ frac78 ⅞ frasl ⁄ frown ⌢ fscr 𝒻 Fscr ℱ gacute ǵ gammad ϝ Gammad Ϝ gamma γ Gamma Γ gap ⪆ gbreve ğ Gbreve Ğ Gcedil Ģ gcirc ĝ Gcirc Ĝ gcy г Gcy Г gdot ġ Gdot Ġ ge ≥ gE ≧ gel ⋛ gEl ⪌ geq ≥ geqq ≧ geqslant ⩾ ges ⩾ gescc ⪩ gesdot ⪀ gesdoto ⪂ gesdotol ⪄ gesles ⪔ gfr 𝔤 Gfr 𝔊 gg ≫ Gg ⋙ ggg ⋙ gimel ℷ gjcy ѓ GJcy Ѓ gl ≷ gla ⪥ glE ⪒ glj ⪤ gnap ⪊ gnapprox ⪊ gne ⪈ gnE ≩ gneq ⪈ gneqq ≩ gnsim ⋧ gopf 𝕘 Gopf 𝔾 grave ` GreaterEqual ≥ GreaterEqualLess ⋛ GreaterFullEqual ≧ GreaterGreater ⪢ GreaterLess ≷ GreaterSlantEqual ⩾ GreaterTilde ≳ gscr ℊ Gscr 𝒢 gsim ≳ gsime ⪎ gsiml ⪐ Gt ≫ gtcc ⪧ gtcir ⩺ gtdot ⋗ gt > gtlPar ⦕ gtquest ⩼ gtrapprox ⪆ gtrarr ⥸ gtrdot ⋗ gtreqless ⋛ gtreqqless ⪌ gtrless ≷ gtrsim ≳ Hacek ˇ half ½ hamilt ℋ hardcy ъ HARDcy Ъ harr ↔ hArr ⇔ harrcir ⥈ harrw ↭ Hat ^ hbar ℏ hcirc ĥ Hcirc Ĥ hearts ♥ heartsuit ♥ hellip … hercon ⊹ hfr 𝔥 Hfr ℌ HilbertSpace ℋ hksearow ⤥ hkswarow ⤦ hoarr ⇿ homtht ∻ hookleftarrow ↩ hookrightarrow ↪ hopf 𝕙 Hopf ℍ horbar ― HorizontalLine ─ hscr 𝒽 Hscr ℋ hslash ℏ hstrok ħ Hstrok Ħ HumpDownHump ≎ HumpEqual ≏ hybull ⁃ hyphen ‐ iacute í Iacute Í icirc î Icirc Î icy и Icy И Idot İ iecy е IEcy Е iexcl ¡ iff ⇔ ifr 𝔦 Ifr ℑ igrave ì Igrave Ì ii ⅈ iiiint ⨌ iiint ∭ iinfin ⧜ iiota ℩ ijlig ij IJlig IJ Im ℑ imacr ī Imacr Ī image ℑ ImaginaryI ⅈ imagline ℐ imagpart ℑ imath ı imof ⊷ imped Ƶ Implies ⇒ in ∈ incare ℅ infin ∞ infintie ⧝ inodot ı int ∫ Int ∬ intcal ⊺ integers ℤ Integral ∫ intercal ⊺ Intersection ⋂ intlarhk ⨗ intprod ⨼ iocy ё IOcy Ё iogon į Iogon Į iopf 𝕚 Iopf 𝕀 iota ι Iota Ι iprod ⨼ iquest ¿ iscr 𝒾 Iscr ℐ isin ∈ isindot ⋵ isinE ⋹ isins ⋴ 
isinsv ⋳ isinv ∈ itilde ĩ Itilde Ĩ iukcy і Iukcy І iuml ï Iuml Ï jcirc ĵ Jcirc Ĵ jcy й Jcy Й jfr 𝔧 Jfr 𝔍 jmath ȷ jopf 𝕛 Jopf 𝕁 jscr 𝒿 Jscr 𝒥 jsercy ј Jsercy Ј jukcy є Jukcy Є kappav ϰ kappa κ Kappa Κ kcedil ķ Kcedil Ķ kcy к Kcy К kfr 𝔨 Kfr 𝔎 kgreen ĸ khcy х KHcy Х kjcy ќ KJcy Ќ kopf 𝕜 Kopf 𝕂 kscr 𝓀 Kscr 𝒦 lAarr ⇚ lacute ĺ Lacute Ĺ laemptyv ⦴ lagran ℒ lambda λ Lambda Λ lang ⟨ Lang ⟪ langd ⦑ langle ⟨ lap ⪅ Laplacetrf ℒ laquo « larr ← lArr ⇐ Larr ↞ larrb ⇤ larrbfs ⤟ larrfs ⤝ larrhk ↩ larrlp ↫ larrpl ⤹ larrsim ⥳ larrtl ↢ lat ⪫ latail ⤙ lAtail ⤛ late ⪭ lbarr ⤌ lBarr ⤎ lbbrk ❲ lbrace { lbrack [ lbrke ⦋ lbrksld ⦏ lbrkslu ⦍ lcaron ľ Lcaron Ľ lcedil ļ Lcedil Ļ lceil ⌈ lcub { lcy л Lcy Л ldca ⤶ ldquo “ ldquor „ ldrdhar ⥧ ldrushar ⥋ ldsh ↲ le ≤ lE ≦ LeftAngleBracket ⟨ leftarrow ← Leftarrow ⇐ LeftArrow ← LeftArrowBar ⇤ LeftArrowRightArrow ⇆ leftarrowtail ↢ LeftCeiling ⌈ LeftDoubleBracket ⟦ LeftDownTeeVector ⥡ LeftDownVector ⇃ LeftDownVectorBar ⥙ LeftFloor ⌊ leftharpoondown ↽ leftharpoonup ↼ leftleftarrows ⇇ leftrightarrow ↔ Leftrightarrow ⇔ LeftRightArrow ↔ leftrightarrows ⇆ leftrightharpoons ⇋ leftrightsquigarrow ↭ LeftRightVector ⥎ LeftTee ⊣ LeftTeeArrow ↤ LeftTeeVector ⥚ leftthreetimes ⋋ LeftTriangle ⊲ LeftTriangleBar ⧏ LeftTriangleEqual ⊴ LeftUpDownVector ⥑ LeftUpTeeVector ⥠ LeftUpVector ↿ LeftUpVectorBar ⥘ LeftVector ↼ LeftVectorBar ⥒ leg ⋚ lEg ⪋ leq ≤ leqq ≦ leqslant ⩽ les ⩽ lescc ⪨ lesdot ⩿ lesdoto ⪁ lesdotor ⪃ lesges ⪓ lessapprox ⪅ lessdot ⋖ lesseqgtr ⋚ lesseqqgtr ⪋ LessEqualGreater ⋚ LessFullEqual ≦ LessGreater ≶ lessgtr ≶ LessLess ⪡ lesssim ≲ LessSlantEqual ⩽ LessTilde ≲ lfisht ⥼ lfloor ⌊ lfr 𝔩 Lfr 𝔏 lg ≶ lgE ⪑ lHar ⥢ lhard ↽ lharu ↼ lharul ⥪ lhblk ▄ ljcy љ LJcy Љ ll ≪ Ll ⋘ llarr ⇇ llcorner ⌞ Lleftarrow ⇚ llhard ⥫ lltri ◺ lmidot ŀ Lmidot Ŀ lmoust ⎰ lmoustache ⎰ lnap ⪉ lnapprox ⪉ lne ⪇ lnE ≨ lneq ⪇ lneqq ≨ lnsim ⋦ loang ⟬ loarr ⇽ lobrk ⟦ longleftarrow ⟵ Longleftarrow ⟸ LongLeftArrow ⟵ longleftrightarrow ⟷ Longleftrightarrow ⟺ LongLeftRightArrow ⟷ longmapsto ⟼ 
longrightarrow ⟶ Longrightarrow ⟹ LongRightArrow ⟶ looparrowleft ↫ looparrowright ↬ lopar ⦅ lopf 𝕝 Lopf 𝕃 loplus ⨭ lotimes ⨴ lowast ∗ lowbar _ LowerLeftArrow ↙ LowerRightArrow ↘ loz ◊ lozenge ◊ lozf ⧫ lpar ( lparlt ⦓ lrarr ⇆ lrcorner ⌟ lrhar ⇋ lrhard ⥭ lrm ‎ lrtri ⊿ lsaquo ‹ lscr 𝓁 Lscr ℒ lsh ↰ Lsh ↰ lsim ≲ lsime ⪍ lsimg ⪏ lsqb [ lsquo ‘ lsquor ‚ lstrok ł Lstrok Ł Lt ≪ ltcc ⪦ ltcir ⩹ ltdot ⋖ lthree ⋋ ltimes ⋉ ltlarr ⥶ lt < ltquest ⩻ ltri ◃ ltrie ⊴ ltrif ◂ ltrPar ⦖ lurdshar ⥊ luruhar ⥦ macr ¯ male ♂ malt ✠ maltese ✠ map ↦ Map ⤅ mapsto ↦ mapstodown ↧ mapstoleft ↤ mapstoup ↥ marker ▮ mcomma ⨩ mcy м Mcy М mdash — mDDot ∺ measuredangle ∡ Mellintrf ℳ mfr 𝔪 Mfr 𝔐 mho ℧ micro µ mid ∣ midast * midcir ⫰ middot · minus - minusb ⊟ minusd ∸ minusdu ⨪ MinusPlus ∓ mlcp ⫛ mldr … mnplus ∓ models ⊧ mopf 𝕞 Mopf 𝕄 mp ∓ mscr 𝓂 Mscr ℳ mstpos ∾ multimap ⊸ mumap ⊸ mu μ Mu Μ nabla ∇ nacute ń Nacute Ń nap ≉ napos ʼn napprox ≉ natur ♮ natural ♮ naturals ℕ nbsp _SPACE_ ncap ⩃ ncaron ň Ncaron Ň ncedil ņ Ncedil Ņ ncong ≇ ncup ⩂ ncy н Ncy Н ndash – ne ≠ nearhk ⤤ nearr ↗ neArr ⇗ nearrow ↗ nequiv ≢ nesear ⤨ NestedGreaterGreater ≫ NestedLessLess ≪ nexist ∄ nexists ∄ nfr 𝔫 Nfr 𝔑 nge ≱ ngeq ≱ ngsim ≵ ngt ≯ ngtr ≯ nharr ↮ nhArr ⇎ nhpar ⫲ ni ∋ nis ⋼ nisd ⋺ niv ∋ njcy њ NJcy Њ nlarr ↚ nlArr ⇍ nldr ‥ nle ≰ nleftarrow ↚ nLeftarrow ⇍ nleftrightarrow ↮ nLeftrightarrow ⇎ nleq ≰ nless ≮ nlsim ≴ nlt ≮ nltri ⋪ nltrie ⋬ nmid ∤ nopf 𝕟 Nopf ℕ not ¬ Not ⫬ NotCongruent ≢ NotCupCap ≭ NotDoubleVerticalBar ∦ NotElement ∉ NotEqual ≠ NotExists ∄ NotGreater ≯ NotGreaterEqual ≱ NotGreaterLess ≹ NotGreaterTilde ≵ notin ∉ notinva ∉ notinvb ⋷ notinvc ⋶ NotLeftTriangle ⋪ NotLeftTriangleEqual ⋬ NotLess ≮ NotLessEqual ≰ NotLessGreater ≸ NotLessTilde ≴ notni ∌ notniva ∌ notnivb ⋾ notnivc ⋽ NotPrecedes ⊀ NotPrecedesSlantEqual ⋠ NotReverseElement ∌ NotRightTriangle ⋫ NotRightTriangleEqual ⋭ NotSquareSubsetEqual ⋢ NotSquareSupersetEqual ⋣ NotSubsetEqual ⊈ NotSucceeds ⊁ NotSucceedsSlantEqual ⋡ NotSupersetEqual ⊉ NotTilde ≁ 
NotTildeEqual ≄ NotTildeFullEqual ≇ NotTildeTilde ≉ NotVerticalBar ∤ npar ∦ nparallel ∦ npolint ⨔ npr ⊀ nprcue ⋠ nprec ⊀ nrarr ↛ nrArr ⇏ nrightarrow ↛ nRightarrow ⇏ nrtri ⋫ nrtrie ⋭ nsc ⊁ nsccue ⋡ nscr 𝓃 Nscr 𝒩 nshortmid ∤ nshortparallel ∦ nsim ≁ nsime ≄ nsimeq ≄ nsmid ∤ nspar ∦ nsqsube ⋢ nsqsupe ⋣ nsub ⊄ nsube ⊈ nsubseteq ⊈ nsucc ⊁ nsup ⊅ nsupe ⊉ nsupseteq ⊉ ntgl ≹ ntilde ñ Ntilde Ñ ntlg ≸ ntriangleleft ⋪ ntrianglelefteq ⋬ ntriangleright ⋫ ntrianglerighteq ⋭ num # numero № nu ν Nu Ν nvdash ⊬ nvDash ⊭ nVdash ⊮ nVDash ⊯ nvHarr ⤄ nvinfin ⧞ nvlArr ⤂ nvrArr ⤃ nwarhk ⤣ nwarr ↖ nwArr ⇖ nwarrow ↖ nwnear ⤧ oacute ó Oacute Ó oast ⊛ ocir ⊚ ocirc ô Ocirc Ô ocy о Ocy О odash ⊝ odblac ő Odblac Ő odiv ⨸ odot ⊙ odsold ⦼ oelig œ OElig Œ ofcir ⦿ ofr 𝔬 Ofr 𝔒 ogon ˛ ograve ò Ograve Ò ogt ⧁ ohbar ⦵ ohm Ω oint ∮ olarr ↺ olcir ⦾ olcross ⦻ oline ‾ olt ⧀ omacr ō Omacr Ō omega ω Omega Ω omicron ο Omicron Ο omid ⦶ ominus ⊖ oopf 𝕠 Oopf 𝕆 opar ⦷ OpenCurlyDoubleQuote “ OpenCurlyQuote ‘ operp ⦹ oplus ⊕ or ∨ Or ⩔ orarr ↻ ord ⩝ order ℴ orderof ℴ ordf ª ordm º origof ⊶ oror ⩖ orslope ⩗ orv ⩛ oS Ⓢ oscr ℴ Oscr 𝒪 oslash ø Oslash Ø osol ⊘ otilde õ Otilde Õ otimes ⊗ Otimes ⨷ otimesas ⨶ ouml ö Ouml Ö ovbar ⌽ OverBar ¯ OverBrace ⏞ OverBracket ⎴ OverParenthesis ⏜ par ∥ para ¶ parallel ∥ parsim ⫳ parsl ⫽ part ∂ PartialD ∂ pcy п Pcy П percnt % period . 
permil ‰ perp ⊥ pertenk ‱ pfr 𝔭 Pfr 𝔓 phiv φ phi φ Phi Φ phmmat ℳ phone ☎ pitchfork ⋔ piv ϖ pi π Pi Π planck ℏ planckh ℎ plankv ℏ plus + plusacir ⨣ plusb ⊞ pluscir ⨢ plusdo ∔ plusdu ⨥ pluse ⩲ PlusMinus ± plusmn ± plussim ⨦ plustwo ⨧ pm ± Poincareplane ℌ pointint ⨕ popf 𝕡 Popf ℙ pound £ pr ≺ Pr ⪻ prap ⪷ prcue ≼ pre ⪯ prE ⪳ prec ≺ precapprox ⪷ preccurlyeq ≼ Precedes ≺ PrecedesEqual ⪯ PrecedesSlantEqual ≼ PrecedesTilde ≾ preceq ⪯ precnapprox ⪹ precneqq ⪵ precnsim ⋨ precsim ≾ prime ′ Prime ″ primes ℙ prnap ⪹ prnE ⪵ prnsim ⋨ prod ∏ Product ∏ profalar ⌮ profline ⌒ profsurf ⌓ prop ∝ Proportion ∷ Proportional ∝ propto ∝ prsim ≾ prurel ⊰ pscr 𝓅 Pscr 𝒫 psi ψ Psi Ψ qfr 𝔮 Qfr 𝔔 qint ⨌ qopf 𝕢 Qopf ℚ qprime ⁗ qscr 𝓆 Qscr 𝒬 quaternions ℍ quatint ⨖ quest ? questeq ≟ quot " QUOT " rAarr ⇛ race ⧚ racute ŕ Racute Ŕ radic √ raemptyv ⦳ rang ⟩ Rang ⟫ rangd ⦒ range ⦥ rangle ⟩ raquo » rarr → rArr ⇒ Rarr ↠ rarrap ⥵ rarrb ⇥ rarrbfs ⤠ rarrc ⤳ rarrfs ⤞ rarrhk ↪ rarrlp ↬ rarrpl ⥅ rarrsim ⥴ rarrtl ↣ Rarrtl ⤖ rarrw ↝ ratail ⤚ rAtail ⤜ ratio ∶ rationals ℚ rbarr ⤍ rBarr ⤏ RBarr ⤐ rbbrk ❳ rbrace } rbrack ] rbrke ⦌ rbrksld ⦎ rbrkslu ⦐ rcaron ř Rcaron Ř rcedil ŗ Rcedil Ŗ rceil ⌉ rcub } rcy р Rcy Р rdca ⤷ rdldhar ⥩ rdquo ” rdquor ” rdsh ↳ Re ℜ real ℜ realine ℛ realpart ℜ reals ℝ rect ▭ reg ® REG ® ReverseElement ∋ ReverseEquilibrium ⇋ ReverseUpEquilibrium ⥯ rfisht ⥽ rfloor ⌋ rfr 𝔯 Rfr ℜ rHar ⥤ rhard ⇁ rharu ⇀ rharul ⥬ rhov ϱ rho ρ Rho Ρ RightAngleBracket ⟩ rightarrow → Rightarrow ⇒ RightArrow → RightArrowBar ⇥ RightArrowLeftArrow ⇄ rightarrowtail ↣ RightCeiling ⌉ RightDoubleBracket ⟧ RightDownTeeVector ⥝ RightDownVector ⇂ RightDownVectorBar ⥕ RightFloor ⌋ rightharpoondown ⇁ rightharpoonup ⇀ rightleftarrows ⇄ rightleftharpoons ⇌ rightrightarrows ⇉ rightsquigarrow ↝ RightTee ⊢ RightTeeArrow ↦ RightTeeVector ⥛ rightthreetimes ⋌ RightTriangle ⊳ RightTriangleBar ⧐ RightTriangleEqual ⊵ RightUpDownVector ⥏ RightUpTeeVector ⥜ RightUpVector ↾ RightUpVectorBar ⥔ RightVector ⇀ RightVectorBar ⥓ ring ˚ 
risingdotseq ≓ rlarr ⇄ rlhar ⇌ rlm ‏ rmoust ⎱ rmoustache ⎱ rnmid ⫮ roang ⟭ roarr ⇾ robrk ⟧ ropar ⦆ ropf 𝕣 Ropf ℝ roplus ⨮ rotimes ⨵ RoundImplies ⥰ rpar ) rpargt ⦔ rppolint ⨒ rrarr ⇉ Rrightarrow ⇛ rsaquo › rscr 𝓇 Rscr ℛ rsh ↱ Rsh ↱ rsqb ] rsquo ’ rsquor ’ rthree ⋌ rtimes ⋊ rtri ▹ rtrie ⊵ rtrif ▸ rtriltri ⧎ RuleDelayed ⧴ ruluhar ⥨ rx ℞ sacute ś Sacute Ś sbquo ‚ sc ≻ Sc ⪼ scap ⪸ scaron š Scaron Š sccue ≽ sce ⪰ scE ⪴ scedil ş Scedil Ş scirc ŝ Scirc Ŝ scnap ⪺ scnE ⪶ scnsim ⋩ scpolint ⨓ scsim ≿ scy с Scy С sdot ⋅ sdotb ⊡ sdote ⩦ searhk ⤥ searr ↘ seArr ⇘ searrow ↘ sect § semi ; seswar ⤩ setminus ∖ setmn ∖ sext ✶ sfr 𝔰 Sfr 𝔖 sfrown ⌢ sharp ♯ shchcy щ SHCHcy Щ shcy ш SHcy Ш ShortDownArrow ↓ ShortLeftArrow ← shortmid ∣ shortparallel ∥ ShortRightArrow → ShortUpArrow ↑ sigmaf ς sigmav ς sigma σ Sigma Σ sim ∼ simdot ⩪ sime ≃ simeq ≃ simg ⪞ simgE ⪠ siml ⪝ simlE ⪟ simne ≆ simplus ⨤ simrarr ⥲ slarr ← SmallCircle ∘ smallsetminus ∖ smashp ⨳ smeparsl ⧤ smid ∣ smile ⌣ smt ⪪ smte ⪬ softcy ь SOFTcy Ь sol / solb ⧄ solbar ⌿ sopf 𝕤 Sopf 𝕊 spades ♠ spadesuit ♠ spar ∥ sqcap ⊓ sqcup ⊔ Sqrt √ sqsub ⊏ sqsube ⊑ sqsubset ⊏ sqsubseteq ⊑ sqsup ⊐ sqsupe ⊒ sqsupset ⊐ sqsupseteq ⊒ squ □ square □ Square □ SquareIntersection ⊓ SquareSubset ⊏ SquareSubsetEqual ⊑ SquareSuperset ⊐ SquareSupersetEqual ⊒ SquareUnion ⊔ squarf ▪ squf ▪ srarr → sscr 𝓈 Sscr 𝒮 ssetmn ∖ ssmile ⌣ sstarf ⋆ star ☆ Star ⋆ starf ★ straightepsilon ϵ straightphi ϕ strns ¯ sub ⊂ Sub ⋐ subdot ⪽ sube ⊆ subE ⫅ subedot ⫃ submult ⫁ subne ⊊ subnE ⫋ subplus ⪿ subrarr ⥹ subset ⊂ Subset ⋐ subseteq ⊆ subseteqq ⫅ SubsetEqual ⊆ subsetneq ⊊ subsetneqq ⫋ subsim ⫇ subsub ⫕ subsup ⫓ succ ≻ succapprox ⪸ succcurlyeq ≽ Succeeds ≻ SucceedsEqual ⪰ SucceedsSlantEqual ≽ SucceedsTilde ≿ succeq ⪰ succnapprox ⪺ succneqq ⪶ succnsim ⋩ succsim ≿ SuchThat ∋ sum ∑ Sum ∑ sung ♪ sup ⊃ Sup ⋑ sup1 ¹ sup2 ² sup3 ³ supdot ⪾ supdsub ⫘ supe ⊇ supE ⫆ supedot ⫄ Superset ⊃ SupersetEqual ⊇ suphsub ⫗ suplarr ⥻ supmult ⫂ supne ⊋ supnE ⫌ supplus ⫀ supset ⊃ Supset ⋑ supseteq ⊇ 
supseteqq ⫆ supsetneq ⊋ supsetneqq ⫌ supsim ⫈ supsub ⫔ supsup ⫖ swarhk ⤦ swarr ↙ swArr ⇙ swarrow ↙ swnwar ⤪ szlig ß target ⌖ tau τ Tau Τ tbrk ⎴ tcaron ť Tcaron Ť tcedil ţ Tcedil Ţ tcy т Tcy Т tdot =⃛ telrec ⌕ tfr 𝔱 Tfr 𝔗 there4 ∴ therefore ∴ Therefore ∴ thetasym ϑ thetav ϑ theta θ Theta Θ thickapprox ≈ thicksim ∼ thkap ≈ thksim ∼ thorn þ THORN Þ tilde ˜ Tilde ∼ TildeEqual ≃ TildeFullEqual ≅ TildeTilde ≈ times × timesb ⊠ timesbar ⨱ timesd ⨰ tint ∭ toea ⤨ top ⊤ topbot ⌶ topcir ⫱ topf 𝕥 Topf 𝕋 topfork ⫚ tosa ⤩ tprime ‴ trade ™ TRADE ™ triangle ▵ triangledown ▿ triangleleft ◃ trianglelefteq ⊴ triangleq ≜ triangleright ▹ trianglerighteq ⊵ tridot ◬ trie ≜ triminus ⨺ TripleDot =⃛ triplus ⨹ trisb ⧍ tritime ⨻ trpezium ⏢ tscr 𝓉 Tscr 𝒯 tscy ц TScy Ц tshcy ћ TSHcy Ћ tstrok ŧ Tstrok Ŧ twixt ≬ twoheadleftarrow ↞ twoheadrightarrow ↠ uacute ú Uacute Ú uarr ↑ uArr ⇑ Uarr ↟ Uarrocir ⥉ ubrcy ў Ubrcy Ў ubreve ŭ Ubreve Ŭ ucirc û Ucirc Û ucy у Ucy У udarr ⇅ udblac ű Udblac Ű udhar ⥮ ufisht ⥾ ufr 𝔲 Ufr 𝔘 ugrave ù Ugrave Ù uHar ⥣ uharl ↿ uharr ↾ uhblk ▀ ulcorn ⌜ ulcorner ⌜ ulcrop ⌏ ultri ◸ umacr ū Umacr Ū uml ¨ UnderBrace ⏟ UnderBracket ⎵ UnderParenthesis ⏝ Union ⋃ UnionPlus ⊎ uogon ų Uogon Ų uopf 𝕦 Uopf 𝕌 uparrow ↑ Uparrow ⇑ UpArrow ↑ UpArrowBar ⤒ UpArrowDownArrow ⇅ updownarrow ↕ Updownarrow ⇕ UpDownArrow ↕ UpEquilibrium ⥮ upharpoonleft ↿ upharpoonright ↾ uplus ⊎ UpperLeftArrow ↖ UpperRightArrow ↗ upsih ϒ upsilon υ Upsilon Υ upsi υ Upsi ϒ UpTee ⊥ UpTeeArrow ↥ upuparrows ⇈ urcorn ⌝ urcorner ⌝ urcrop ⌎ uring ů Uring Ů urtri ◹ uscr 𝓊 Uscr 𝒰 utdot ⋰ utilde ũ Utilde Ũ utri ▵ utrif ▴ uuarr ⇈ uuml ü Uuml Ü uwangle ⦧ vangrt ⦜ varepsilon ε varkappa ϰ varnothing ∅ varphi φ varpi ϖ varpropto ∝ varr ↕ vArr ⇕ varrho ϱ varsigma ς vartheta ϑ vartriangleleft ⊲ vartriangleright ⊳ vBar ⫨ Vbar ⫫ vBarv ⫩ vcy в Vcy В vdash ⊢ vDash ⊨ Vdash ⊩ VDash ⊫ Vdashl ⫦ vee ∨ Vee ⋁ veebar ⊻ veeeq ≚ vellip ⋮ verbar | Verbar ‖ vert | Vert ‖ VerticalBar ∣ VerticalLine | VerticalSeparator ❘ VerticalTilde ≀ vfr 𝔳 Vfr 𝔙 vltri 
⊲ vopf 𝕧 Vopf 𝕍 vprop ∝ vrtri ⊳ vscr 𝓋 Vscr 𝒱 Vvdash ⊪ vzigzag ⦚ wcirc ŵ Wcirc Ŵ wedbar ⩟ wedge ∧ Wedge ⋀ wedgeq ≙ weierp ℘ wfr 𝔴 Wfr 𝔚 wopf 𝕨 Wopf 𝕎 wp ℘ wr ≀ wreath ≀ wscr 𝓌 Wscr 𝒲 xcap ⋂ xcirc ◯ xcup ⋃ xdtri ▽ xfr 𝔵 Xfr 𝔛 xharr ⟷ xhArr ⟺ xi ξ Xi Ξ xlarr ⟵ xlArr ⟸ xmap ⟼ xnis ⋻ xodot ⨀ xopf 𝕩 Xopf 𝕏 xoplus ⨁ xotime ⨂ xrarr ⟶ xrArr ⟹ xscr 𝓍 Xscr 𝒳 xsqcup ⨆ xuplus ⨄ xutri △ xvee ⋁ xwedge ⋀ yacute ý Yacute Ý yacy я YAcy Я ycirc ŷ Ycirc Ŷ ycy ы Ycy Ы yen ¥ yfr 𝔶 Yfr 𝔜 yicy ї YIcy Ї yopf 𝕪 Yopf 𝕐 yscr 𝓎 Yscr 𝒴 yucy ю YUcy Ю yuml ÿ Yuml Ÿ zacute ź Zacute Ź zcaron ž Zcaron Ž zcy з Zcy З zdot ż Zdot Ż zeetrf ℨ zeta ζ Zeta Ζ zfr 𝔷 Zfr ℨ zhcy ж ZHcy Ж zigrarr ⇝ zopf 𝕫 Zopf ℤ zscr 𝓏 Zscr 𝒵 zwj ‍ zwnj ‌ ); my %spac= ( ## to be processed ##ensp   ##emsp   ##emsp13   ##emsp14   ##numsp   ##puncsp   ##thinsp   ##ThinSpace   ##hairsp   ##VeryThinSpace   ##ZeroWidthSpace ​ ##NegativeVeryThinSpace ​ ##NegativeThinSpace ​ ##NegativeMediumSpace ​ ##NegativeThickSpace ​ ##MediumSpace   ##NoBreak ⁠ ##ApplyFunction ⁡ ##af ⁡ ##InvisibleTimes ⁢ ##it ⁢ ##InvisibleComma ⁣ ##ic ⁣ ); # # TODO # - vermo-nos livre de qualquer < ou > perdidos... 
#
# Command-line switches.  The script is started with "perl -s" (see the
# shebang line), so every "-switch" or "-switch=value" placed before the file
# names is stored in the package variable of the same name.
our ($name, $listofpairs, $comm, $noimg, $tag, $nolatin1, $breakby,
     $nosentbreak, $getalt, $textouput, $textoutput, $latin1output, $isutf8,
     $islatin1, $breakbyemptyline, $breakbynl, $inlinetags, $txt, $txtll,
     $indentedpar);

# The switch was declared and documented as "-textouput", but the code tests
# $textoutput; accept both spellings so the documented one takes effect.
$textoutput ||= $textouput;

# Element names that act as segment separators (extended by -breakby=...).
my @breakby = qw(
    table tr td th p br h1 h2 h3 h4 h5 h6 li ul ol dl dt dd div blockquote hr
    address center form input noscript label thead tbody caption optgroup
    legend pre textarea option fieldset article main
);    #select

# Element names presumably stripped as bare tags (content kept); "img" and
# the inline tags are added to this list depending on the switches below.
my @removtag = qw(col body html em font a tt small strong area map span iframe abbr big dir link select);

# Inline formatting tags, kept only with -inlinetags.
my @inlinetag = qw(sup sub b i u);

# Element names joined into $patremov below.
my @remov = qw(object marquee frameset head meta script map area style svg button nav);

my $rawbreakby = "";

# PerlIO layer applied to every output handle.  ":iso-8859-1" is not a PerlIO
# layer (binmode rejects it), so Latin-1 output is requested through
# :encoding(); :raw first drops any encoding layer already on the handle.
my $o = ":utf8";
$o = ":raw:encoding(iso-8859-1)" if $latin1output;

$tag = "p" if not $tag;

push @removtag, "img"      if $noimg;
push @removtag, @inlinetag unless $inlinetags;
push @breakby,  $breakby   if $breakby;

# Extra, non-tag segment separators.  The assignments are not exclusive: when
# several switches are given the last matching one wins.
$rawbreakby = '|\n[ \t]*\n'                    if $breakbyemptyline || $txt;
$rawbreakby = '|\n(?: {4,}|\t| *\n)(?:\s*)'    if $indentedpar;
$rawbreakby = '|[ \t]*\n'                      if $breakbynl || $txtll;

# NOTE(review): the three patterns below look truncated -- text resembling an
# HTML tag appears to have been lost from this copy of the file (as written,
# @removtag and @breakby are never used).  They are reproduced verbatim;
# restore them from the upstream source before running.
my $patremovtag = q{])*>};
my $patremov = '<(' . join('|', @remov) . ')\b[^>]*>(.|\n)*?';
my $patsep = '\s*(?:]*>\s*' . $rawbreakby . ')+';

if ($listofpairs) {
    # -listofpairs: each input line holds two file names separated by a tab;
    # the first file of each pair goes to corpus A, the second to corpus B.
    my $corpus1 = "$name.A.out";
    my $corpus2 = "$name.B.out";

    # With exactly three arguments the last two name the output files.
    if (scalar(@ARGV) == 3) {
        $corpus2 = pop @ARGV;
        $corpus1 = pop @ARGV;
    }

    # Three-argument open: the file name can no longer alter the open mode.
    open A, '>', $corpus1 or die "Error creating file [$corpus1]: $!";
    open B, '>', $corpus2 or die "Error creating file [$corpus2]: $!";
    binmode A, $o;
    binmode B, $o;

    while (<>) {
        my ($a,$b) = m!(.*?)\t(.*)! or die("invalid lines");
        print STDERR "($a)($b)\n";
        next if ($a =~ /\.pdf$/ or $b =~ /\.pdf$/ );
        $id++;

        # NOTE(review): the markup inside the strings printed around each
        # document seems to have been lost from this copy; kept verbatim.
        print A "\n";    ## XXX: what is this?
        print A "

id='$id' name='$a' 111 222 333 444 555 666 777 888 999

\n";
        print A html2p($a);
        print A "
\n";
        print B "\n";    ## XXX: what is this?
        print B "

id='$id' name='$b' 111 222 333 444 555 666 777 888 999

\n";
        print B html2p($b);
        print B "
\n";
    }
}
else {
    # Default mode: convert every file named on the command line to STDOUT.
    binmode STDOUT, $o;
    my $head="";

    # NOTE(review): both branches assign empty strings in this copy; the
    # original header text was presumably lost.  Kept verbatim.
    if($o eq ":utf8"){ $head=""; $head= "$head";}
    else {$head = "";}

    for (@ARGV) {
        print "$head\n\n

\n" unless $textoutput;
        print html2p($_);
        print "
\n" unless $textoutput;
    }
}

# html2p($file)
#
# Reads $file, strips the HTML and returns the text as a string of segments.
# Steps, in order: slurp the file as bytes; normalise CRLF; look for a charset
# declaration (only the "other charset" and "unknown" branches decode the
# data); cut the text according to __BEGIN__/__END__ markers; expand named and
# numeric character references; remove tags; optionally downgrade characters
# above U+00FF for Latin-1 output; split on $patsep and emit each segment as
# plain text (-textoutput), as one <$tag> element (-nosentbreak) or through
# Lingua::PT::PLNbase::xmlsentences.  Relies on the file-level %ent,
# $patremovtag, $patremov, $patsep and on the command-line switches.
sub html2p{
    my $f = shift;

    # ASCII/Latin-1 stand-ins used by -latin1output.
    # NOTE(review): line breaks in this table were reconstructed; the
    # chr(8722) entry and the ligature entries (64256-64259) are read here as
    # active entries followed by an empty comment -- they may have been
    # commented out upstream.  Confirm against the original file.
    my %utf2lat = (
        chr(169)   => "(c)",
        "…"        => "...",
        "€"        => " Euros ",
        "ˆ"        => "^"  ,   # U+2C6 (a more compact ^)
        "’"        => "'"  ,   # 226?
        "‘"        => "'"  ,   # 8216
        "‐"        => "-"  ,   # 8208
        "‑"        => "-"  ,   # 8209 non breaking "-"
        "–"        => "--" ,   # 820A ?
        "—"        => "---",   #
        chr(8722)  => "-",     # math "-"
        "“"        => "\"" ,
        "”"        => "\"" ,
        chr(8206)  => ""   ,   # R to L mark??
        chr(8207)  => ""   ,   # L to R mark??
        chr(8236)  => ""   ,   # R to L overwrite
        chr(8237)  => ""   ,   # L to R overwrite
        chr(946)   => "ß",     #
        chr(64256) => "ff",    #
        chr(64257) => "fi",    #
        chr(64258) => "fl",    #
        chr(64259) => "ffi",   # not sure...
        chr(8226)  => "-*",    # list item, similar to (·)??
    );

    my $r = "";
    open F, '<:bytes', $f or die "cant open $f: $!";
#   elsif($isutf8){open(F,"recode -f utf8..latin1 < '$f'| recode html..latin1|tee '$f.latin1' |") or die("cant open $f\n")}
#   else {open(F,"recode -f html..latin1 < '$f'|") or die("cant open $f\n")}

    # Slurp the whole file.  The read expression was missing in this copy of
    # the source; F is the only handle in scope, so it is restored as <F>.
    local $/;
    $_ = <F>;
    close F;

    s/\r\n/\n/g;

    # Charset detection.
    # NOTE(review): the UTF-8 and Latin-1 branches only report the charset;
    # their decode() calls are commented out, so the data stays as raw bytes
    # there.  Confirm this is intended.
    if (m#http-equiv.*charset=utf-8#i || $isutf8 ||
        m#http-equiv.*charset=unicode-1-1-utf-8#i ||
        m#meta.*charset=["']utf-8["']#i ) {
#       $_ = decode("utf-8", $_, 0);
        print STDERR "Itis a UTF8 input\n";
    }
    elsif (m#http-equiv.*charset=iso-8859-1#i || $islatin1 ||
           m#meta.*charset=["']iso-8859-1["']#i ) {
#       $_ = decode("iso-8859-1", $_, 0);
#       $_ = decode("CP1252", $_, 0);
        print STDERR "Itis a LATIN1 input\n";
    }
    elsif (m#http-equiv.*charset=([a-zA-Z0-9_\-]+)#i ) {
        print STDERR "Input appear to be $1\n";
        $_ = decode($1, $_, 0);
    }
    else {
        # XXX - it may be worth adding a flag for utf8
        print STDERR "Input enconding is NOT clear, using utf-8\n";
        $_ = decode("utf-8", $_, 0);
        print STDERR "Debug: before subs.\n";
    }

    # Keep only the text between __BEGIN__ and __END__ markers, when present.
    s#\n__END__\n.*?\n__BEGIN__\n##sg;
    s#.*?\n__BEGIN__\n##s;
    s#\n__END__\n.*##s;

    # NOTE(review): the next two substitutions have an empty pattern in this
    # copy (the pattern text appears to have been lost).  Kept verbatim.
    s###sg;
    s###isg;
    s#<\?xml.*?>##isg;

    s#\&(\w+);#$ent{$1} || $& #ge;            ## expand named entities (&name;)
    s#\&\#(\d+);# pack("U",$1) #ge;
    s#\&\#x([\dA-Fa-f]+);# pack("U",hex($1)) #ge;

### s#\x{A0}# | #g;        #A0 - strange character similar to "|"
    s#$patremovtag# #ig;
    s#$patremov# #ig;
    s#src=("[^"\n]{200,}"|'[^'\n]{200,}')#INLINEIMG#ig ;

    # NOTE(review): pattern lost in this copy (it must have captured $1);
    # kept verbatim.
    s##IMG $1#ig if $getalt ;

    if ($latin1output) {
        # Use exists(): several table entries map to the empty string, which
        # "||" treated as "no entry" and turned into _unicode_N_ markers.
        s#([\x{100}-\x{ffff}])# exists $utf2lat{$1} ? $utf2lat{$1} : sprintf('_unicode_%d_',ord($1))#ge;
    }

    s#_SPACE_# #ig;
    s#\x{200b}##g;       ## remove zero width spaces ##FIXME???

    print STDERR "Debug: before subs.\n";
    for (split(/$patsep/i,$_)) {
        $r .= "\n" if $comm;
        s/\r/ /g;
        s/\s*\n\s*/ /g;      # flatten each segment onto a single line
        if ($textoutput) { $r .= "$_\n"; }
        elsif ($nosentbreak) { $r .= "<$tag>$_\n"; }
        else { $r .= Lingua::PT::PLNbase::xmlsentences({st=>$tag},$_)."\n"; }
    }
    $r=fixinlinetag($r) if $inlinetags;
    return mind_the_sync_tag($r);
}

# fixinlinetag($text)
#
# Tidies whitespace around the inline tags listed in @inlinetag, collapses
# runs of horizontal whitespace, echoes the result to STDERR and returns it.
# NOTE(review): several patterns below contain empty groups or bare blank
# lines where tag text appears to have been lost; they are kept verbatim and
# must be restored from the upstream source.
sub fixinlinetag{
    my $text= shift;
    my $pinlinetag= join('|',@inlinetag);
    $text =~ s!(\s+)()!$2$1!g;
    $text =~ s!(<(?:$pinlinetag)>)(\s+)!$2$1!g;
    while( $text =~ s!(

\s*

)\s*()!$2$1!g
        or $text =~ s!\s*<\1>! !g
        or $text =~ s!<($pinlinetag)>\s*! !g) {}
    $text =~ s!\h{2,}! !g;
    $text =~ s!

\s*

\n!!g;
    print STDERR $text;
    return $text;
}

# mind_the_sync_tag($text)
#
# Returns $text; when the first substitution matches, two further
# substitutions are applied before returning.
# NOTE(review): the second group of the first pattern and the whole pattern of
# the second substitution are empty in this copy (text lost); kept verbatim.
sub mind_the_sync_tag{
    my $text= shift;
    if($text =~ s!(<$tag>)()!\n$2\n$1!g) {
        $text =~ s!!!;
        $text =~ s!<$tag>!!;
        $text;
    }
    else { $text }
}

__END__

=head1 NAME

html2pml - html to list of C

=head1 SYNOPSIS

 html2pml [-tag=...] [options] file
 html2pml -listofpairs [-tag=...] [options] file

=head1 DESCRIPTION

C<html2pml> transforms HTML into PML ("<p>" markup language - only P tags
are used) with the independent segments, after dividing them into sentences.

It was designed to help in the process of aligning texts.

The command C<recode> should be installed in order to make the conversion
to latin1 possible.

If one or more C<__BEGIN__> and / or C<__END__> markers are present, the text
is cut accordingly before alignment.

=head2 With C<-listofpairs> option

With the C<-listofpairs> option, it accepts a file whose lines contain 2
filenames separated by a tab, converts them to PML and makes 2 output files
(I<name>.A.out and I<name>.B.out) with the PMLs.

Each file is tagged with .... in order to help in the process of aligning
texts.

With this option, we can also use C<-name=...> to set the corpus name.

=head1 Options

C<-comm> - with this option an XML comment is inserted with the
removed/transformed tags (for debug)

C<-tag=T> - use tag name T (instead of default - C<p>)

C<-noimg> - remove IMG tags (default keep them)

C<-nosentbreak> - don't use xmlsentences to break inside paragraphs

C<-breakby=tag> - use C<tag> as a sentence separator

C<-txt> or C<-breakbyemptyline> - use empty lines as paragraph separators

C<-txtll> or C<-breakbynl> - use new-line as paragraph separator

C<-indentedpar> - use an indented line as a paragraph separator

C<-latin1output> - by default HTML is converted to UTF-8; use this option to
force latin1 output

C<-textoutput> - remove pml markup

C<-isutf8> - force UTF-8 interpretation of the HTML input. By default,
html2pml searches for the "encoding" attribute.

C<-inlinetags> - preserve tags i b u sub sup

=head1 AUTHOR

J.Joao Almeida, jj@di.uminho.pt

=head1 SEE ALSO

perl(1).

Lingua::PT::PLNbase

=cut