mirror of
https://github.com/pkali/piradio-mini.git
synced 2026-05-20 22:34:22 +02:00
New translate class, but not final...
This commit is contained in:
+6
-2
@@ -1,7 +1,7 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: latin-1 -*-
|
||||
#
|
||||
# $Id: rss_class.py,v 1.23 2016/06/28 06:40:51 bob Exp $
|
||||
# $Id: rss_class.py,v 1.25 2017/04/26 08:18:25 bob Exp $
|
||||
# Raspberry Pi RSS feed class
|
||||
#
|
||||
# Author : Bob Rathbone
|
||||
@@ -63,9 +63,13 @@ class Rss:
|
||||
self.feed_available = True
|
||||
line = self.rss.pop()
|
||||
self.length -= 1
|
||||
|
||||
line = line.lstrip('<')
|
||||
feed = translate.all(line)
|
||||
feed = feed.lstrip('u"')
|
||||
feed = feed.lstrip("u'")
|
||||
feed = feed.lstrip('"')
|
||||
feed = feed.lstrip('<')
|
||||
feed = feed.rstrip('"')
|
||||
if not self.rss_error:
|
||||
log.message(feed,log.DEBUG)
|
||||
return feed
|
||||
|
||||
+186
-90
@@ -1,10 +1,10 @@
|
||||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: latin-1 -*-
|
||||
#
|
||||
# Raspberry Pi Radio Character translation class
|
||||
# Escaped characters, html and unicode translation to ascii
|
||||
#
|
||||
# $Id: translate_class.py,v 1.24 2016/04/14 06:37:56 bob Exp $
|
||||
# $Id: translate_class.py,v 1.37 2017/05/16 11:48:02 bob Exp $
|
||||
#
|
||||
# Author : Bob Rathbone
|
||||
# Site : http://www.bobrathbone.com
|
||||
@@ -17,10 +17,10 @@
|
||||
# Useful Links on character encodings
|
||||
# http://www.zytrax.com/tech/web/entities.html
|
||||
# http://www.utf8-chartable.de/
|
||||
#
|
||||
# http://www.codetable.net/
|
||||
# http://www.ascii-code.com/
|
||||
|
||||
|
||||
import os
|
||||
import os,sys
|
||||
import time
|
||||
import unicodedata
|
||||
from log_class import Log
|
||||
@@ -35,11 +35,16 @@ class Translate:
|
||||
codes = {
|
||||
'//' : '/', # Double /
|
||||
' ' : ' ', # Double spaces
|
||||
'\\xa0' : ' ', # Non-breaking space to space
|
||||
'\\' : "'", # Double backslash to apostrophe
|
||||
'\\n' : ' ', # Line feed to space
|
||||
|
||||
# German UTF8 codes
|
||||
'\\xef\\xbf\\xbd' : chr(246),
|
||||
|
||||
# Currencies
|
||||
'\\xe2\\x82\\xac' : ' Euro ',
|
||||
|
||||
# Special characters
|
||||
'\\x80\\x99' : "'", # Single quote
|
||||
'\\xc2\\xa1' : '!', # Inverted exclamation
|
||||
'\\xc2\\xa2' : 'c', # Cent sign
|
||||
'\\xc2\\xa3' : '#', # Pound sign
|
||||
@@ -82,21 +87,22 @@ class Translate:
|
||||
'\\xc3\\x96' : chr(214), # O umlaut
|
||||
'\\xc3\\x9c' : chr(220), # U umlaut
|
||||
|
||||
# Norwegian unicode escape sequences
|
||||
'\\xc3\\x98' : 'O', # Oslash
|
||||
'\\xc3\\xb8' : 'o', # Oslash
|
||||
'\\xc3\\x85' : 'A', # Aring
|
||||
# Scandinavian unicode escape sequences
|
||||
'\\xc2\\x88' : 'A', # aelig
|
||||
'\\xc2\\xb4' : 'A', # aelig
|
||||
'\\xc3\\x85' : 'Aa', # Aring
|
||||
'\\xc3\\x93' : 'O', # O grave
|
||||
'\\xc3\\xa4' : 'a', # a with double dot
|
||||
'\\xc3\\xa5' : 'a', # aring
|
||||
'\\xc3\\x86' : 'AE', # AElig
|
||||
'\\xc3\\x98' : 'O', # O crossed
|
||||
'\\xc3\\x98' : '0', # O crossed
|
||||
'\\xc3\\x99' : 'U', # U grave
|
||||
'\\xc3\\xa6' : 'ae', # aelig
|
||||
'\\xc3\\xb0' : 'o', # o umlaut
|
||||
'\\xc3\\xb3' : 'o', # o tilde
|
||||
'\\xc3\\xb2' : 'o', # o tilde
|
||||
'\\xc3\\xb3' : 'o', # o reverse tilde
|
||||
'\\xc3\\xb4' : 'o', # Small o circumflex
|
||||
'\\xc3\\xb8' : 'o', # oslash
|
||||
'\\xc2\\x88' : 'A', # aelig
|
||||
'\\xc2\\xb4' : 'A', # aelig
|
||||
|
||||
# French (Latin) unicode escape sequences
|
||||
'\\xc3\\x80' : 'A', # A grave
|
||||
@@ -106,22 +112,26 @@ class Translate:
|
||||
'\\xc3\\x88' : 'E', # E grave
|
||||
'\\xc3\\x89' : 'E', # E acute
|
||||
'\\xc3\\x8a' : 'E', # E circumflex
|
||||
'\\xc3\\xa0' : chr(224), # a grave
|
||||
'\\xc3\\xa1' : chr(225), # a acute
|
||||
'\\xc3\\xa2' : chr(226), # a circumflex
|
||||
'\\xc3\\xa8' : chr(232), # e grave
|
||||
'\\xc3\\xa9' : chr(233), # e acute
|
||||
'\\xc3\\xaa' : chr(234), # e circumflex
|
||||
'\\xc3\\xb6' : "'", # Hyphon
|
||||
'\\xc3\\xa0' : 'a', # a grave
|
||||
'\\xc3\\xa1' : 'a', # a acute
|
||||
'\\xc3\\xa2' : 'a', # a circumflex
|
||||
'\\xc3\\xa7' : 'c', # c cedilla
|
||||
'\\xc3\\xa8' : 'e', # e grave
|
||||
'\\xc3\\xa9' : 'e', # e acute
|
||||
'\\xc3\\xaa' : 'e', # e circumflex
|
||||
'\\xc3\\xab' : 'e', # e diaeresis
|
||||
'\\xc3\\xae' : 'i', # i circumflex
|
||||
'\\xc3\\xaf' : 'i', # i diaeresis
|
||||
'\\xc3\\xb7' : "/", # Division sign
|
||||
'\\xc5\\x93' : 'oe', # oe joined
|
||||
|
||||
# Hungarian lower case
|
||||
'\\xc3\\xb3' : chr(243), #
|
||||
'\\xc3\\xad' : chr(237), #
|
||||
'\\xc3\\xb5' : chr(245), #
|
||||
'\\xc5\\x91' : chr(245), #
|
||||
'\\xc3\\xb3' : 'o', # o circumflex
|
||||
'\\xc3\\xad' : 'i', # i accent
|
||||
'\\xc3\\xb5' : 'o', # o tilde
|
||||
'\\xc5\\x91' : 'o', # o
|
||||
'\\xc5\\xb1' : chr(252), #
|
||||
'\\xc3\\xba' : chr(250), # Ã
|
||||
'\\xc3\\xba' : 'u', # u acute
|
||||
|
||||
# Polish unicode escape sequences
|
||||
'\\xc4\\x84' : 'A', # A,
|
||||
@@ -194,58 +204,116 @@ class Translate:
|
||||
'\\xce\\xc8' : 'ps', # Psi
|
||||
'\\xce\\xc9' : 'o', # Omega
|
||||
|
||||
# Currency other special character
|
||||
'\\xa3' : chr(156), # UK pound sign
|
||||
'\\xa9' : chr(169), # Copyright
|
||||
# Icelandic
|
||||
'\\xc3\\xbe' : 'p', # Like a p with up stroke
|
||||
'\\xc3\\xbd' : 'y', # y diaeresis
|
||||
|
||||
# German short hex representation
|
||||
'\\xdf' : chr(223), # Sharp s es-zett
|
||||
'\\xe4' : chr(228), # a umlaut
|
||||
'\\xf6' : chr(246), # o umlaut
|
||||
'\\xfc' : chr(252), # u umlaut
|
||||
'\\xc4' : chr(196), # A umlaut
|
||||
'\\xd6' : chr(214), # O umlaut
|
||||
'\\xdc' : chr(220), # U umlaut
|
||||
# Italian characters
|
||||
'\\xc3\\xac' : 'i', # i reverse circumflex
|
||||
'\\xc3\\xb9' : 'u', # u reverse circumflex
|
||||
|
||||
# Spanish and French
|
||||
'\\xe0' : chr(224), # Small a reverse acute
|
||||
'\\xe1' : chr(225), # Small a acute
|
||||
'\\xe2' : chr(226), # Small a circumflex
|
||||
'\\xe7' : chr(231), # Small c Cedilla
|
||||
'\\xe8' : chr(232), # Small e grave
|
||||
'\\xe9' : chr(233), # Small e acute
|
||||
'\\xea' : chr(234), # Small e circumflex
|
||||
'\\xeb' : chr(235), # Small e diaeresis
|
||||
'\\xed' : chr(237), # Small i acute
|
||||
'\\xee' : chr(238), # Small i circumflex
|
||||
'\\xf1' : chr(241), # Small n tilde
|
||||
'\\xf3' : chr(243), # Small o acute
|
||||
'\\xf4' : chr(244), # Small o circumflex
|
||||
'\\xf9' : chr(249), # Small u circumflex
|
||||
'\\xfa' : chr(250), # Small u acute
|
||||
'\\xfb' : chr(251), # u circumflex
|
||||
# Polish (not previously covered)
|
||||
'\\xc3\\xa3' : 'a', # a tilde
|
||||
|
||||
'\\xc0' : chr(192), # Capital A grave
|
||||
'\\xc1' : chr(193), # Capital A acute
|
||||
# Romanian
|
||||
'\\xc4\\x83' : 'a', # a circumflex variant
|
||||
'\\xc3\\xa2' : 'a', # a circumflex
|
||||
'\\xc3\\xae' : 'i', # i circumflex
|
||||
'\\xc5\\x9f' : 's', # s cedilla ?
|
||||
'\\xc5\\xa3' : 's', # t cedilla ?
|
||||
'\\xc8\\x99' : 's', # s with down stroke
|
||||
'\\xc8\\x9b' : 't', # t with down stroke
|
||||
|
||||
'\\xc7' : chr(199), # Capital C Cedilla
|
||||
'\\xc9' : chr(201), # Capital E acute
|
||||
'\\xcd' : chr(205), # Capital I acute
|
||||
'\\xd3' : chr(211), # Capital O acute
|
||||
'\\xda' : chr(218), # Capital U acute
|
||||
# Spanish not covered above
|
||||
'\\xc3\\xb1' : 'n', # n tilde
|
||||
|
||||
'\\xbf' : chr(191), # Spanish Punctuation
|
||||
|
||||
'xb0' : 'o', # Degrees symbol
|
||||
# Turkish not covered above
|
||||
'\\xc3\\xbb' : 'u', # u circumflex
|
||||
'\\xc4\\x9f' : 'g', # g tilde
|
||||
'\\xc4\\xb1' : 'i', # Looks like an i
|
||||
'\\xc4\\xb0' : 'I', # Looks like an I
|
||||
}
|
||||
|
||||
# UTF8 codes (Must be checked after above codes checked)
|
||||
short_codes = {
|
||||
'\\xa0' : ' ', # Non-breaking space to space
|
||||
|
||||
'\\xb4' : "'", # Apostrophe
|
||||
'\\xc0' : 'A', # A
|
||||
'\\xc1' : 'A', # A
|
||||
'\\xc2' : 'A', # A
|
||||
'\\xc3' : 'A', # A
|
||||
'\\xc4' : 'A', # A
|
||||
'\\xc5' : 'A', # A
|
||||
'\\xc6' : 'Ae', # AE
|
||||
'\\xc7' : 'C', # C
|
||||
'\\xc8' : 'E', # E
|
||||
'\\xc9' : 'E', # E
|
||||
'\\xca' : 'E', # E
|
||||
'\\xcb' : 'E', # E
|
||||
'\\xcc' : 'I', # I
|
||||
'\\xcd' : 'I', # I
|
||||
'\\xce' : 'I', # I
|
||||
'\\xcf' : 'I', # I
|
||||
'\\xd0' : 'D', # D
|
||||
'\\xd1' : 'N', # N
|
||||
'\\xd2' : 'O', # O
|
||||
'\\xd3' : 'O', # O
|
||||
'\\xd4' : 'O', # O
|
||||
'\\xd5' : 'O', # O
|
||||
'\\xd6' : 'O', # O
|
||||
'\\xd7' : 'x', # Multiply
|
||||
'\\xd8' : '0', # O crossed
|
||||
'\\xd9' : 'U', # U
|
||||
'\\xda' : 'U', # U
|
||||
'\\xdb' : 'U', # U
|
||||
'\\xdc' : 'U', # U umlaut
|
||||
'\\xdd' : 'Y', # Y
|
||||
'\\xdf' : 'S', # Sharp s es-zett
|
||||
'\\xe0' : 'e', # Small a reverse acute
|
||||
'\\xe1' : 'a', # Small a acute
|
||||
'\\xe2' : 'a', # Small a circumflex
|
||||
'\\xe3' : 'a', # Small a tilde
|
||||
'\\xe4' : 'a', # Small a diaeresis
|
||||
'\\xe5' : 'aa', # Small a ring above
|
||||
'\\xe6' : 'ae', # Joined ae
|
||||
'\\xe7' : 'c', # Small c Cedilla
|
||||
'\\xe8' : 'e', # Small e grave
|
||||
'\\xe9' : 'e', # Small e acute
|
||||
'\\xea' : 'e', # Small e circumflex
|
||||
'\\xeb' : 'e', # Small e diaeresis
|
||||
'\\xed' : 'i', # Small i acute
|
||||
'\\xee' : 'i', # Small i circumflex
|
||||
'\\xf1' : 'n', # Small n tilde
|
||||
'\\xf3' : 'o', # Small o acute
|
||||
'\\xf4' : 'o', # Small o circumflex
|
||||
'\\xf6' : 'o', # o umlaut
|
||||
'\\xf7' : '/', # Division sign
|
||||
'\\xf8' : 'oe', # Small o strike through
|
||||
'\\xf9' : 'u', # Small u circumflex
|
||||
'\\xfa' : 'u', # Small u acute
|
||||
'\\xfb' : 'u', # u circumflex
|
||||
'\\xc0' : 'A', # Capital A grave
|
||||
'\\xc1' : 'A', # Capital A acute
|
||||
'\\xc7' : 'C', # Capital C Cedilla
|
||||
'\\xc9' : 'E', # Capital E acute
|
||||
'\\xcd' : 'I', # Capital I acute
|
||||
'\\xd3' : 'O', # Capital O acute
|
||||
'\\xda' : 'U', # Capital U acute
|
||||
'\\xfc' : 'u', # u umlaut
|
||||
'\\xbf' : '?', # Spanish Punctuation
|
||||
|
||||
'\\xb0' : 'o', # Degrees symbol
|
||||
}
|
||||
|
||||
# HTML codes (RSS feeds)
|
||||
HtmlCodes = {
|
||||
# Currency
|
||||
chr(156) : '#', # Pound by hash
|
||||
chr(169) : '(c)', # Copyright
|
||||
|
||||
# Norwegian
|
||||
chr(216) : 'O', # Oslash
|
||||
chr(216) : '0', # Oslash
|
||||
|
||||
# Spanish french
|
||||
chr(241) : 'n', # Small tilde n
|
||||
@@ -278,7 +346,8 @@ class Translate:
|
||||
chr(196) : "Ae", # A umlaut
|
||||
chr(214) : "Oe", # O umlaut
|
||||
chr(220) : "Ue", # U umlaut
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
unicodes = {
|
||||
'\\u201e' : '"', # ORF feed
|
||||
@@ -288,34 +357,34 @@ class Translate:
|
||||
'\\u0153' : "oe", # French oe
|
||||
'\\u2009' : ' ', # Short space to space
|
||||
'\\u2013' : '-', # Long dash to minus sign
|
||||
'\\u2019' : "'", # French apostrophe
|
||||
|
||||
'\\u2018' : "'", # Left single quote
|
||||
'\\u2019' : "'", # Right single quote
|
||||
# Polish unicodes (I don't know why, but works :) ) (Pecus)
|
||||
"'u0104" : "A", # A, (Pecus)
|
||||
"'u0105" : "a", # a, (Pecus)
|
||||
"'u0106" : "C", # C' (Pecus)
|
||||
"'u0107" : "c", # c' (Pecus)
|
||||
"'u0118" : "E", # E, (Pecus)
|
||||
"'u0119" : "e", # e, (Pecus)
|
||||
"'u0141" : "L", # L/ (Pecus)
|
||||
"'u0142" : "l", # l/ (Pecus)
|
||||
"'u0143" : "N", # N' (Pecus)
|
||||
"'u0144" : "n", # n' (Pecus)
|
||||
"'xd3" : "O", # O' (Pecus)
|
||||
"'xf3" : "o", # o' (Pecus)
|
||||
"'u015a" : "S", # S' (Pecus)
|
||||
"'u015b" : "s", # s' (Pecus)
|
||||
"'u0179" : "Z", # Z' (Pecus)
|
||||
"'u017a" : "z", # z' (Pecus)
|
||||
"'u017b" : "Z", # Z. (Pecus)
|
||||
"'u017c" : "z", # z. (Pecus)
|
||||
'\\u0104' : "A", # A, (Pecus)
|
||||
'\\u0105' : "a", # a, (Pecus)
|
||||
'\\u0106' : "C", # C' (Pecus)
|
||||
'\\u0107' : "c", # c' (Pecus)
|
||||
'\\u0118' : "E", # E, (Pecus)
|
||||
'\\u0119' : "e", # e, (Pecus)
|
||||
'\\u0141' : "L", # L/ (Pecus)
|
||||
'\\u0142' : "l", # l/ (Pecus)
|
||||
'\\u0143' : "N", # N' (Pecus)
|
||||
'\\u0144' : "n", # n' (Pecus)
|
||||
#"'xd3" : "O", # O' (Pecus)
|
||||
#"'xf3" : "o", # o' (Pecus)
|
||||
'\\u015a' : "S", # S' (Pecus)
|
||||
'\\u015b' : "s", # s' (Pecus)
|
||||
'\\u0179' : "Z", # Z' (Pecus)
|
||||
'\\u017a' : "z", # z' (Pecus)
|
||||
'\\u017b' : "Z", # Z. (Pecus)
|
||||
'\\u017c' : "z", # z. (Pecus)
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
log.init('radio')
|
||||
return
|
||||
|
||||
# Translate all
|
||||
# Translate all (Called by rss class)
|
||||
def all(self,text):
|
||||
s = self._convert2escape(text)
|
||||
s = self._escape(s)
|
||||
@@ -327,21 +396,29 @@ class Translate:
|
||||
def _convert2escape(self,text):
|
||||
s = repr(text)
|
||||
if s.__len__() > 2:
|
||||
s= s[1:-1] # Strip ' characters
|
||||
s = s.lstrip("'")
|
||||
#s= s[1:-1] # Strip ' characters
|
||||
s = s.lstrip('\'')
|
||||
s = s.rstrip('\'')
|
||||
return s
|
||||
|
||||
# Convert escaped characters (umlauts) to normal characters
|
||||
def escape(self,text):
|
||||
s = self._convert2escape(text)
|
||||
s = self._escape(s)
|
||||
s = s.lstrip('"')
|
||||
s = s.rstrip('"')
|
||||
return s
|
||||
|
||||
# Convert escaped characters (umlauts etc.) to normal characters
|
||||
def _escape(self,text):
|
||||
s = text
|
||||
|
||||
for code in self.codes:
|
||||
s = s.replace(code, self.codes[code])
|
||||
|
||||
for code in self.short_codes:
|
||||
s = s.replace(code, self.short_codes[code])
|
||||
|
||||
s = s.replace("'oC",'oC') # Degrees C fudge
|
||||
s = s.replace("'oF",'oF') # Degrees F fudge
|
||||
return s
|
||||
@@ -419,3 +496,22 @@ class Translate:
|
||||
return s
|
||||
|
||||
# End of class
|
||||
|
||||
# Test translate class
|
||||
if __name__ == '__main__':
|
||||
|
||||
translate = Translate()
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
text = sys.argv[1]
|
||||
else:
|
||||
text = 'æ Æ ø Ø å Å'
|
||||
print text
|
||||
s = translate._convert2escape(text)
|
||||
print s
|
||||
|
||||
# Complete text
|
||||
print translate.all(text)
|
||||
print
|
||||
sys.exit(0)
|
||||
# End of file
|
||||
|
||||
Reference in New Issue
Block a user