Package creoleparser :: Module core
[hide private]
[frames | no frames]

Source Code for Module creoleparser.core

  1  # core.py 
  2  # 
  3  # Copyright (c) 2007 Stephen Day 
  4  # 
  5  # This module is part of Creoleparser and is released under 
  6  # the MIT License: http://www.opensource.org/licenses/mit-license.php 
  7  # 
  8   
  9  import re 
 10   
 11  import genshi.builder as bldr 
 12   
 13  __docformat__ = 'restructuredtext en' 
 14   
 15  escape_char = '~' 
 16  esc_neg_look = '(?<!' + re.escape(escape_char) + ')' 
 17  esc_to_remove = re.compile(''.join([r'(?<!',re.escape(escape_char),')',re.escape(escape_char),r'(?!([ \n]|$))'])) 
 18  place_holder_re = re.compile(r'<<<(-?\d+?)>>>') 
 19  max_blank_lines = 250 
 20   
def fill_from_store(text, element_store):
    """Split ``text`` around ``<<<id>>>`` place holders and resolve them.

    Every place holder matched by ``place_holder_re`` is replaced by the
    value stored under its id in ``element_store``. If the id is not in
    the store, the place holder text itself (``<<<id>>>``) is kept.

    :parameters:
      text
        the string to scan for place holders
      element_store
        mapping from place holder id (the matched string) to the stored
        fragment

    Returns a list containing the literal pieces of text and the
    looked-up fragments, in their original order. Text without any
    place holder (including the empty string) is returned as a
    one-item list.
    """
    pieces = []
    cursor = 0
    for hit in place_holder_re.finditer(text):
        # Literal text between the previous place holder and this one.
        if hit.start() > cursor:
            pieces.append(text[cursor:hit.start()])
        key = hit.group(1)
        pieces.append(element_store.get(key, '<<<' + key + '>>>'))
        cursor = hit.end()

    # Keep whatever follows the last place holder. When nothing matched at
    # all, the whole text is emitted, even if it is empty.
    if cursor < len(text) or not pieces:
        pieces.append(text[cursor:])
    return pieces
37 38
def fragmentize(text, wiki_elements, element_store, remove_escapes=True):
    """Parse wiki markup into a list of fragments (Elements and strings).

    The entries of ``wiki_elements`` are tried in order. The first entry
    whose regular expression matches ``text`` takes over by way of its
    ``_process`` method, which receives the match, the text, the entries
    not yet ruled out and the element store. An entry may itself be a
    list or tuple of WikiElement objects; in that case all of them are
    searched and the one matching closest to the start of ``text`` is
    used (the earliest listed one wins a tie).

    When no entry matches, or none were supplied, escape characters are
    removed from ``text`` (unless ``remove_escapes`` is False) and place
    holders are filled in from ``element_store``.

    :parameters:
      text
        the text to be parsed
      wiki_elements
        list of WikiElement objects (or lists/tuples of them) to be
        searched for
      element_store
        dictionary handed on to ``_process`` and used to resolve place
        holders
      remove_escapes
        If False, escapes will not be removed

    """
    remaining = wiki_elements
    while remaining:
        head = remaining[0]
        if isinstance(head, (list, tuple)):
            candidates = head
        else:
            candidates = [head]

        # Search with every candidate and keep only those that matched.
        hits = []
        for candidate in candidates:
            found = candidate.regexp.search(text)
            if found:
                hits.append((candidate, found))

        if hits:
            # min() returns the first of several equally early matches.
            winner, match = min(hits, key=lambda pair: pair[1].start())
            return winner._process(match, text, remaining, element_store)

        remaining = remaining[1:]

    # Nothing (left) to search for: plain text.
    if remove_escapes:
        text = esc_to_remove.sub('', text)
    return fill_from_store(text, element_store)
91 92
class Parser(object):

    """Instantiates a parser with specified behaviour"""

    def __init__(self, dialect, method='xhtml', strip_whitespace=False,
                 encoding='utf-8'):
        """Constructor for Parser objects.

        :parameters:
          dialect
            A Creole instance
          method
            This value is passed to Genshi's Stream.render(). Possible
            values include ``xhtml``, ``html``, and ``xml``.
          strip_whitespace
            This value is passed to Genshi's Stream.render().
          encoding
            This value is passed to Genshi's Stream.render().
        """
        self.dialect = dialect
        self.method = method
        self.strip_whitespace = strip_whitespace
        self.encoding = encoding

    def generate(self, text, element_store=None, context='block'):
        """Returns a Genshi Stream.

        :parameters:
          text
            The text to be parsed.
          context
            This is useful for macro development where (for example)
            suppression of paragraph tags is desired. Can be 'inline',
            'block', or a list of WikiElement objects (use with caution).
          element_store
            Internal dictionary that's passed around a lot ;)

        Raises ValueError if ``context`` is none of the supported values.

        """
        if element_store is None:
            element_store = {}

        if isinstance(context, list):
            top_level_elements = context
            do_preprocess = False
        elif context == 'block':
            top_level_elements = self.dialect.block_elements
            do_preprocess = True
        elif context == 'inline':
            top_level_elements = self.dialect.inline_elements
            do_preprocess = False
        else:
            # Fail with a clear message instead of an UnboundLocalError
            # further down.
            raise ValueError(
                "context must be 'block', 'inline' or a list of "
                "WikiElement objects, not %r" % (context,))

        # Only block-level parsing goes through preprocess(), which may
        # return the text split into several chunks.
        if do_preprocess:
            chunks = preprocess(text, self.dialect)
        else:
            chunks = [text]

        fragments = [fragmentize(piece, top_level_elements, element_store)
                     for piece in chunks]
        return bldr.tag(*fragments).generate()

    def render(self, text, element_store=None, context='block', **kwargs):
        """Returns final output string (e.g., xhtml)

        ``text``, ``element_store`` and ``context`` are handed to
        generate(). The parser's ``method``, ``strip_whitespace`` and
        ``encoding`` settings, together with any extra keyword arguments,
        are passed to the ``render`` method of the resulting stream (see
        the Genshi documentation).
        """
        if element_store is None:
            element_store = {}
        stream = self.generate(text, element_store, context)
        return stream.render(method=self.method,
                             strip_whitespace=self.strip_whitespace,
                             encoding=self.encoding, **kwargs)

    def __call__(self, text, element_store=None, context='block', **kwargs):
        """Wrapper for the render method. Returns final output string.

        All arguments, including extra keyword arguments, are forwarded
        to render().
        """
        return self.render(text, element_store, context, **kwargs)
171 172
def preprocess(text, dialect):
    """Normalise ``text`` and split it up if it has many blank lines.

    This should generally be called before fragmentize(). Line endings
    are converted to ``\\n`` and trailing whitespace is replaced by a
    single newline. If the dialect's blank line pattern matches more
    than ``max_blank_lines`` times, the text is handed to chunk();
    otherwise it is returned as a single chunk.

    :parameters:
      text
        text to be processed.
      dialect
        a ``Creole`` object.

    Returns a list of strings.
    """
    # Windows line endings first, then any remaining lone carriage returns.
    normalised = text.replace("\r\n", "\n").replace("\r", "\n")
    normalised = normalised.rstrip() + '\n'

    blank_lines = [mo for mo in dialect.blank_line.regexp.finditer(normalised)]
    if len(blank_lines) <= max_blank_lines:
        return [normalised]

    # Elements that chunk() must not cut through.
    hard_elements = [dialect.pre, dialect.bodied_block_macro]
    return chunk(normalised, blank_lines, hard_elements, max_blank_lines)
190 191
def chunk(text, blank_lines, hard_elements, limit):
    """Safely breaks large Creole documents into a list of smaller
    ones (strings)

    :parameters:
      text
        the document to be split
      blank_lines
        list of match objects, one for every blank line found in ``text``
      hard_elements
        objects with a ``regexp`` attribute; the text is never cut at a
        blank line that starts inside one of their matches
      limit
        number of blank lines per group; at most one cut is made per
        complete group

    For each complete group of ``limit`` blank lines, every tenth blank
    line from the middle of that group up to the middle of the next one
    is considered, and the text is cut at the first one that is not
    inside a hard element. The blank line at the cut is dropped. A group
    without a usable blank line produces no cut.

    Returns the list of chunks; the last item is always the remainder of
    the text.
    """
    # Every character position covered by a match of a hard element.
    hard_chars = set()
    for element in hard_elements:
        for mo in element.regexp.finditer(text):
            hard_chars.update(range(*mo.span()))

    chunks = []
    start = 0
    # Floor division keeps these integers on Python 3 as well; true division
    # would yield floats, which neither range() nor slicing accept.
    for i in range(len(blank_lines) // limit):
        first = limit // 2 + i * limit
        last = limit * 3 // 2 + i * limit
        for mo in blank_lines[first:last:10]:
            if mo.start() not in hard_chars:
                chunks.append(text[start:mo.start()])
                start = mo.end()
                break
    chunks.append(text[start:])

    return chunks
217 218 219
def _test():
    """Run ``doctest.testmod()`` with its default arguments."""
    from doctest import testmod
    testmod()


if __name__ == "__main__":
    _test()