Commit f97f4db2002913e1a18d17d7988ff27e00b7b9c1
1 parent
4c12b068
Exists in
master
and in
7 other branches
--no commit message
Showing
12 changed files
with
0 additions
and
3250 deletions
Show diff stats
pacotes/openlayers/tools/BeautifulSoup.py
... | ... | @@ -1,1767 +0,0 @@ |
1 | -"""Beautiful Soup | |
2 | -Elixir and Tonic | |
3 | -"The Screen-Scraper's Friend" | |
4 | -http://www.crummy.com/software/BeautifulSoup/ | |
5 | - | |
6 | -Beautiful Soup parses a (possibly invalid) XML or HTML document into a | |
7 | -tree representation. It provides methods and Pythonic idioms that make | |
8 | -it easy to navigate, search, and modify the tree. | |
9 | - | |
10 | -A well-formed XML/HTML document yields a well-formed data | |
11 | -structure. An ill-formed XML/HTML document yields a correspondingly | |
12 | -ill-formed data structure. If your document is only locally | |
13 | -well-formed, you can use this library to find and process the | |
14 | -well-formed part of it. The BeautifulSoup class | |
15 | - | |
16 | -Beautiful Soup works with Python 2.2 and up. It has no external | |
17 | -dependencies, but you'll have more success at converting data to UTF-8 | |
18 | -if you also install these three packages: | |
19 | - | |
20 | -* chardet, for auto-detecting character encodings | |
21 | - http://chardet.feedparser.org/ | |
22 | -* cjkcodecs and iconv_codec, which add more encodings to the ones supported | |
23 | - by stock Python. | |
24 | - http://cjkpython.i18n.org/ | |
25 | - | |
26 | -Beautiful Soup defines classes for two main parsing strategies: | |
27 | - | |
28 | - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific | |
29 | - language that kind of looks like XML. | |
30 | - | |
31 | - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid | |
32 | - or invalid. This class has web browser-like heuristics for | |
33 | - obtaining a sensible parse tree in the face of common HTML errors. | |
34 | - | |
35 | -Beautiful Soup also defines a class (UnicodeDammit) for autodetecting | |
36 | -the encoding of an HTML or XML document, and converting it to | |
37 | -Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. | |
38 | - | |
39 | -For more than you ever wanted to know about Beautiful Soup, see the | |
40 | -documentation: | |
41 | -http://www.crummy.com/software/BeautifulSoup/documentation.html | |
42 | - | |
43 | -""" | |
44 | -from __future__ import generators | |
45 | - | |
46 | -__author__ = "Leonard Richardson (leonardr@segfault.org)" | |
47 | -__version__ = "3.0.4" | |
48 | -__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" | |
49 | -__license__ = "PSF" | |
50 | - | |
51 | -from sgmllib import SGMLParser, SGMLParseError | |
52 | -import codecs | |
53 | -import types | |
54 | -import re | |
55 | -import sgmllib | |
56 | -try: | |
57 | - from htmlentitydefs import name2codepoint | |
58 | -except ImportError: | |
59 | - name2codepoint = {} | |
60 | - | |
61 | -#This hack makes Beautiful Soup able to parse XML with namespaces | |
62 | -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') | |
63 | - | |
64 | -DEFAULT_OUTPUT_ENCODING = "utf-8" | |
65 | - | |
66 | -# First, the classes that represent markup elements. | |
67 | - | |
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with the given element,
        putting the replacement at this element's position."""
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            # BUGFIX: was "if index and index < myIndex", which wrongly
            # skipped the adjustment when the sibling sat at position 0.
            if index < myIndex:
                # It comes before this element. That means that when we
                # extract it, the index of this element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild into this element's contents at the given
        position, rewiring all the navigation pointers."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                # BUGFIX: this used to call self.find(newChild), which is
                # a tree *search* returning an element, not a list index.
                index = self.contents.index(newChild)
                # BUGFIX: was "if index and ...", which skipped index 0.
                if index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Walk up until some ancestor has a next sibling; that
            # element is what follows newChild in document order.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # BUGFIX: **kwargs was silently dropped here, so attribute
        # keyword filters (e.g. findAllNext(align='center')) were ignored.
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # BUGFIX: **kwargs was not forwarded to findParents.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        # Shared "first match only" driver for the find* front-ends.
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        # NOTE(review): the parameter shadows the builtin 'str'; left
        # unchanged because renaming it would break keyword callers.
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.
        ."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
350 | - | |
351 | -class NavigableString(unicode, PageElement): | |
352 | - | |
353 | - def __getattr__(self, attr): | |
354 | - """text.string gives you text. This is for backwards | |
355 | - compatibility for Navigable*String, but for CData* it lets you | |
356 | - get the string without the CData wrapper.""" | |
357 | - if attr == 'string': | |
358 | - return self | |
359 | - else: | |
360 | - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) | |
361 | - | |
362 | - def __unicode__(self): | |
363 | - return self.__str__(None) | |
364 | - | |
365 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
366 | - if encoding: | |
367 | - return self.encode(encoding) | |
368 | - else: | |
369 | - return self | |
370 | - | |
class CData(NavigableString):
    """A CDATA section: rendered wrapped in a CDATA marker."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + body + "]]>"
375 | - | |
class ProcessingInstruction(NavigableString):
    """A processing instruction, e.g. an XML declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        # Replace the %SOUP-ENCODING% placeholder with the real encoding.
        if output.find("%SOUP-ENCODING%") != -1:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)
382 | - | |
class Comment(NavigableString):
    """An HTML or XML comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!--" + body + "-->"
386 | - | |
class Declaration(NavigableString):
    """A declaration, e.g. a DOCTYPE."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!" + body + ">"
390 | - | |
391 | -class Tag(PageElement): | |
392 | - | |
393 | - """Represents a found HTML tag with its attributes and contents.""" | |
394 | - | |
395 | - XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot", | |
396 | - '"' : "quote", | |
397 | - "&" : "amp", | |
398 | - "<" : "lt", | |
399 | - ">" : "gt" } | |
400 | - | |
401 | - def __init__(self, parser, name, attrs=None, parent=None, | |
402 | - previous=None): | |
403 | - "Basic constructor." | |
404 | - | |
405 | - # We don't actually store the parser object: that lets extracted | |
406 | - # chunks be garbage-collected | |
407 | - self.parserClass = parser.__class__ | |
408 | - self.isSelfClosing = parser.isSelfClosingTag(name) | |
409 | - self.name = name | |
410 | - if attrs == None: | |
411 | - attrs = [] | |
412 | - self.attrs = attrs | |
413 | - self.contents = [] | |
414 | - self.setup(parent, previous) | |
415 | - self.hidden = False | |
416 | - self.containsSubstitutions = False | |
417 | - | |
418 | - def get(self, key, default=None): | |
419 | - """Returns the value of the 'key' attribute for the tag, or | |
420 | - the value given for 'default' if it doesn't have that | |
421 | - attribute.""" | |
422 | - return self._getAttrMap().get(key, default) | |
423 | - | |
424 | - def has_key(self, key): | |
425 | - return self._getAttrMap().has_key(key) | |
426 | - | |
427 | - def __getitem__(self, key): | |
428 | - """tag[key] returns the value of the 'key' attribute for the tag, | |
429 | - and throws an exception if it's not there.""" | |
430 | - return self._getAttrMap()[key] | |
431 | - | |
432 | - def __iter__(self): | |
433 | - "Iterating over a tag iterates over its contents." | |
434 | - return iter(self.contents) | |
435 | - | |
436 | - def __len__(self): | |
437 | - "The length of a tag is the length of its list of contents." | |
438 | - return len(self.contents) | |
439 | - | |
440 | - def __contains__(self, x): | |
441 | - return x in self.contents | |
442 | - | |
443 | - def __nonzero__(self): | |
444 | - "A tag is non-None even if it has no contents." | |
445 | - return True | |
446 | - | |
447 | - def __setitem__(self, key, value): | |
448 | - """Setting tag[key] sets the value of the 'key' attribute for the | |
449 | - tag.""" | |
450 | - self._getAttrMap() | |
451 | - self.attrMap[key] = value | |
452 | - found = False | |
453 | - for i in range(0, len(self.attrs)): | |
454 | - if self.attrs[i][0] == key: | |
455 | - self.attrs[i] = (key, value) | |
456 | - found = True | |
457 | - if not found: | |
458 | - self.attrs.append((key, value)) | |
459 | - self._getAttrMap()[key] = value | |
460 | - | |
461 | - def __delitem__(self, key): | |
462 | - "Deleting tag[key] deletes all 'key' attributes for the tag." | |
463 | - for item in self.attrs: | |
464 | - if item[0] == key: | |
465 | - self.attrs.remove(item) | |
466 | - #We don't break because bad HTML can define the same | |
467 | - #attribute multiple times. | |
468 | - self._getAttrMap() | |
469 | - if self.attrMap.has_key(key): | |
470 | - del self.attrMap[key] | |
471 | - | |
472 | - def __call__(self, *args, **kwargs): | |
473 | - """Calling a tag like a function is the same as calling its | |
474 | - findAll() method. Eg. tag('a') returns a list of all the A tags | |
475 | - found within this tag.""" | |
476 | - return apply(self.findAll, args, kwargs) | |
477 | - | |
478 | - def __getattr__(self, tag): | |
479 | - #print "Getattr %s.%s" % (self.__class__, tag) | |
480 | - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: | |
481 | - return self.find(tag[:-3]) | |
482 | - elif tag.find('__') != 0: | |
483 | - return self.find(tag) | |
484 | - | |
485 | - def __eq__(self, other): | |
486 | - """Returns true iff this tag has the same name, the same attributes, | |
487 | - and the same contents (recursively) as the given tag. | |
488 | - | |
489 | - NOTE: right now this will return false if two tags have the | |
490 | - same attributes in a different order. Should this be fixed?""" | |
491 | - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): | |
492 | - return False | |
493 | - for i in range(0, len(self.contents)): | |
494 | - if self.contents[i] != other.contents[i]: | |
495 | - return False | |
496 | - return True | |
497 | - | |
498 | - def __ne__(self, other): | |
499 | - """Returns true iff this tag is not identical to the other tag, | |
500 | - as defined in __eq__.""" | |
501 | - return not self == other | |
502 | - | |
503 | - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
504 | - """Renders this tag as a string.""" | |
505 | - return self.__str__(encoding) | |
506 | - | |
507 | - def __unicode__(self): | |
508 | - return self.__str__(None) | |
509 | - | |
510 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, | |
511 | - prettyPrint=False, indentLevel=0): | |
512 | - """Returns a string or Unicode representation of this tag and | |
513 | - its contents. To get Unicode, pass None for encoding. | |
514 | - | |
515 | - NOTE: since Python's HTML parser consumes whitespace, this | |
516 | - method is not certain to reproduce the whitespace present in | |
517 | - the original string.""" | |
518 | - | |
519 | - encodedName = self.toEncoding(self.name, encoding) | |
520 | - | |
521 | - attrs = [] | |
522 | - if self.attrs: | |
523 | - for key, val in self.attrs: | |
524 | - fmt = '%s="%s"' | |
525 | - if isString(val): | |
526 | - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: | |
527 | - val = self.substituteEncoding(val, encoding) | |
528 | - | |
529 | - # The attribute value either: | |
530 | - # | |
531 | - # * Contains no embedded double quotes or single quotes. | |
532 | - # No problem: we enclose it in double quotes. | |
533 | - # * Contains embedded single quotes. No problem: | |
534 | - # double quotes work here too. | |
535 | - # * Contains embedded double quotes. No problem: | |
536 | - # we enclose it in single quotes. | |
537 | - # * Embeds both single _and_ double quotes. This | |
538 | - # can't happen naturally, but it can happen if | |
539 | - # you modify an attribute value after parsing | |
540 | - # the document. Now we have a bit of a | |
541 | - # problem. We solve it by enclosing the | |
542 | - # attribute in single quotes, and escaping any | |
543 | - # embedded single quotes to XML entities. | |
544 | - if '"' in val: | |
545 | - fmt = "%s='%s'" | |
546 | - # This can't happen naturally, but it can happen | |
547 | - # if you modify an attribute value after parsing. | |
548 | - if "'" in val: | |
549 | - val = val.replace("'", "&squot;") | |
550 | - | |
551 | - # Now we're okay w/r/t quotes. But the attribute | |
552 | - # value might also contain angle brackets, or | |
553 | - # ampersands that aren't part of entities. We need | |
554 | - # to escape those to XML entities too. | |
555 | - val = re.sub("([<>]|&(?![^\s]+;))", | |
556 | - lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", | |
557 | - val) | |
558 | - | |
559 | - attrs.append(fmt % (self.toEncoding(key, encoding), | |
560 | - self.toEncoding(val, encoding))) | |
561 | - close = '' | |
562 | - closeTag = '' | |
563 | - if self.isSelfClosing: | |
564 | - close = ' /' | |
565 | - else: | |
566 | - closeTag = '</%s>' % encodedName | |
567 | - | |
568 | - indentTag, indentContents = 0, 0 | |
569 | - if prettyPrint: | |
570 | - indentTag = indentLevel | |
571 | - space = (' ' * (indentTag-1)) | |
572 | - indentContents = indentTag + 1 | |
573 | - contents = self.renderContents(encoding, prettyPrint, indentContents) | |
574 | - if self.hidden: | |
575 | - s = contents | |
576 | - else: | |
577 | - s = [] | |
578 | - attributeString = '' | |
579 | - if attrs: | |
580 | - attributeString = ' ' + ' '.join(attrs) | |
581 | - if prettyPrint: | |
582 | - s.append(space) | |
583 | - s.append('<%s%s%s>' % (encodedName, attributeString, close)) | |
584 | - if prettyPrint: | |
585 | - s.append("\n") | |
586 | - s.append(contents) | |
587 | - if prettyPrint and contents and contents[-1] != "\n": | |
588 | - s.append("\n") | |
589 | - if prettyPrint and closeTag: | |
590 | - s.append(space) | |
591 | - s.append(closeTag) | |
592 | - if prettyPrint and closeTag and self.nextSibling: | |
593 | - s.append("\n") | |
594 | - s = ''.join(s) | |
595 | - return s | |
596 | - | |
597 | - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
598 | - return self.__str__(encoding, True) | |
599 | - | |
600 | - def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, | |
601 | - prettyPrint=False, indentLevel=0): | |
602 | - """Renders the contents of this tag as a string in the given | |
603 | - encoding. If encoding is None, returns a Unicode string..""" | |
604 | - s=[] | |
605 | - for c in self: | |
606 | - text = None | |
607 | - if isinstance(c, NavigableString): | |
608 | - text = c.__str__(encoding) | |
609 | - elif isinstance(c, Tag): | |
610 | - s.append(c.__str__(encoding, prettyPrint, indentLevel)) | |
611 | - if text and prettyPrint: | |
612 | - text = text.strip() | |
613 | - if text: | |
614 | - if prettyPrint: | |
615 | - s.append(" " * (indentLevel-1)) | |
616 | - s.append(text) | |
617 | - if prettyPrint: | |
618 | - s.append("\n") | |
619 | - return ''.join(s) | |
620 | - | |
621 | - #Soup methods | |
622 | - | |
623 | - def find(self, name=None, attrs={}, recursive=True, text=None, | |
624 | - **kwargs): | |
625 | - """Return only the first child of this Tag matching the given | |
626 | - criteria.""" | |
627 | - r = None | |
628 | - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) | |
629 | - if l: | |
630 | - r = l[0] | |
631 | - return r | |
632 | - findChild = find | |
633 | - | |
634 | - def findAll(self, name=None, attrs={}, recursive=True, text=None, | |
635 | - limit=None, **kwargs): | |
636 | - """Extracts a list of Tag objects that match the given | |
637 | - criteria. You can specify the name of the Tag and any | |
638 | - attributes you want the Tag to have. | |
639 | - | |
640 | - The value of a key-value pair in the 'attrs' map can be a | |
641 | - string, a list of strings, a regular expression object, or a | |
642 | - callable that takes a string and returns whether or not the | |
643 | - string matches for some custom definition of 'matches'. The | |
644 | - same is true of the tag name.""" | |
645 | - generator = self.recursiveChildGenerator | |
646 | - if not recursive: | |
647 | - generator = self.childGenerator | |
648 | - return self._findAll(name, attrs, text, limit, generator, **kwargs) | |
649 | - findChildren = findAll | |
650 | - | |
651 | - # Pre-3.x compatibility methods | |
652 | - first = find | |
653 | - fetch = findAll | |
654 | - | |
655 | - def fetchText(self, text=None, recursive=True, limit=None): | |
656 | - return self.findAll(text=text, recursive=recursive, limit=limit) | |
657 | - | |
658 | - def firstText(self, text=None, recursive=True): | |
659 | - return self.find(text=text, recursive=recursive) | |
660 | - | |
661 | - #Utility methods | |
662 | - | |
663 | - def append(self, tag): | |
664 | - """Appends the given tag to the contents of this tag.""" | |
665 | - self.contents.append(tag) | |
666 | - | |
667 | - #Private methods | |
668 | - | |
669 | - def _getAttrMap(self): | |
670 | - """Initializes a map representation of this tag's attributes, | |
671 | - if not already initialized.""" | |
672 | - if not getattr(self, 'attrMap'): | |
673 | - self.attrMap = {} | |
674 | - for (key, value) in self.attrs: | |
675 | - self.attrMap[key] = value | |
676 | - return self.attrMap | |
677 | - | |
678 | - #Generator methods | |
679 | - def childGenerator(self): | |
680 | - for i in range(0, len(self.contents)): | |
681 | - yield self.contents[i] | |
682 | - raise StopIteration | |
683 | - | |
684 | - def recursiveChildGenerator(self): | |
685 | - stack = [(self, 0)] | |
686 | - while stack: | |
687 | - tag, start = stack.pop() | |
688 | - if isinstance(tag, Tag): | |
689 | - for i in range(start, len(tag.contents)): | |
690 | - a = tag.contents[i] | |
691 | - yield a | |
692 | - if isinstance(a, Tag) and tag.contents: | |
693 | - if i < len(tag.contents) - 1: | |
694 | - stack.append((tag, i+1)) | |
695 | - stack.append((a, 0)) | |
696 | - break | |
697 | - raise StopIteration | |
698 | - | |
699 | -# Next, a couple classes to represent queries and their results. | |
700 | -class SoupStrainer: | |
701 | - """Encapsulates a number of ways of matching a markup element (tag or | |
702 | - text).""" | |
703 | - | |
704 | - def __init__(self, name=None, attrs={}, text=None, **kwargs): | |
705 | - self.name = name | |
706 | - if isString(attrs): | |
707 | - kwargs['class'] = attrs | |
708 | - attrs = None | |
709 | - if kwargs: | |
710 | - if attrs: | |
711 | - attrs = attrs.copy() | |
712 | - attrs.update(kwargs) | |
713 | - else: | |
714 | - attrs = kwargs | |
715 | - self.attrs = attrs | |
716 | - self.text = text | |
717 | - | |
718 | - def __str__(self): | |
719 | - if self.text: | |
720 | - return self.text | |
721 | - else: | |
722 | - return "%s|%s" % (self.name, self.attrs) | |
723 | - | |
724 | - def searchTag(self, markupName=None, markupAttrs={}): | |
725 | - found = None | |
726 | - markup = None | |
727 | - if isinstance(markupName, Tag): | |
728 | - markup = markupName | |
729 | - markupAttrs = markup | |
730 | - callFunctionWithTagData = callable(self.name) \ | |
731 | - and not isinstance(markupName, Tag) | |
732 | - | |
733 | - if (not self.name) \ | |
734 | - or callFunctionWithTagData \ | |
735 | - or (markup and self._matches(markup, self.name)) \ | |
736 | - or (not markup and self._matches(markupName, self.name)): | |
737 | - if callFunctionWithTagData: | |
738 | - match = self.name(markupName, markupAttrs) | |
739 | - else: | |
740 | - match = True | |
741 | - markupAttrMap = None | |
742 | - for attr, matchAgainst in self.attrs.items(): | |
743 | - if not markupAttrMap: | |
744 | - if hasattr(markupAttrs, 'get'): | |
745 | - markupAttrMap = markupAttrs | |
746 | - else: | |
747 | - markupAttrMap = {} | |
748 | - for k,v in markupAttrs: | |
749 | - markupAttrMap[k] = v | |
750 | - attrValue = markupAttrMap.get(attr) | |
751 | - if not self._matches(attrValue, matchAgainst): | |
752 | - match = False | |
753 | - break | |
754 | - if match: | |
755 | - if markup: | |
756 | - found = markup | |
757 | - else: | |
758 | - found = markupName | |
759 | - return found | |
760 | - | |
761 | - def search(self, markup): | |
762 | - #print 'looking for %s in %s' % (self, markup) | |
763 | - found = None | |
764 | - # If given a list of items, scan it for a text element that | |
765 | - # matches. | |
766 | - if isList(markup) and not isinstance(markup, Tag): | |
767 | - for element in markup: | |
768 | - if isinstance(element, NavigableString) \ | |
769 | - and self.search(element): | |
770 | - found = element | |
771 | - break | |
772 | - # If it's a Tag, make sure its name or attributes match. | |
773 | - # Don't bother with Tags if we're searching for text. | |
774 | - elif isinstance(markup, Tag): | |
775 | - if not self.text: | |
776 | - found = self.searchTag(markup) | |
777 | - # If it's text, make sure the text matches. | |
778 | - elif isinstance(markup, NavigableString) or \ | |
779 | - isString(markup): | |
780 | - if self._matches(markup, self.text): | |
781 | - found = markup | |
782 | - else: | |
783 | - raise Exception, "I don't know how to match against a %s" \ | |
784 | - % markup.__class__ | |
785 | - return found | |
786 | - | |
    def _matches(self, markup, matchAgainst):
        """Low-level match of one piece of markup against one criterion.

        `matchAgainst` may be the literal True (match anything that is
        not None), a callable, a regular expression object, a list, a
        map, or a string; `markup` is a Tag, a string, or None.
        """
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        # The explicit type check distinguishes the literal True from
        # merely truthy values such as 1 or a non-empty string.
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): at this point `markup` is a string or
                # None, and strings have no has_key method, so this
                # branch looks like it would raise AttributeError if
                # ever taken -- confirm intent before relying on it.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Coerce the criterion to the markup's string type so
                # the equality check below compares like with like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
818 | - | |
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Fixed: initialize *this* instance; the original called
        # list.__init__([]), which initialized a throwaway list.
        list.__init__(self)
        # The SoupStrainer that produced this set of results.
        self.source = source
825 | - | |
826 | -# Now, some helper functions. | |
827 | - | |
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Anything that supports the iterator protocol counts.
    if hasattr(l, '__iter__'):
        return True
    # Otherwise fall back to the concrete sequence types.
    return type(l) in (types.ListType, types.TupleType)
833 | - | |
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # Fixed: the original misspelled the second check as
        # "isintance", which raised NameError and silently fell
        # through to the str-only check below.
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # No unicode/basestring names: everything stringlike is str.
        return isinstance(s, str)
841 | - | |
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its entries in verbatim.
            built.update(portion)
        elif isList(portion):
            # A list: every entry maps to the default value.
            for key in portion:
                built[key] = default
        else:
            # A scalar: it maps to the default value.
            built[portion] = default
    return built
860 | - | |
861 | -# Now, the parser classes. | |
862 | - | |
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Names of the XML entities ("lt", "gt", ...) that may be converted
    # back to characters when convertEntities is XML_ENTITIES.
    XML_ENTITY_LIST = {}
    for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
        XML_ENTITY_LIST[i] = True

    # Subclasses override these maps to describe tag behavior; this
    # base parser assumes nothing (see class docstring).
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # (regexp, replacement) pairs applied to raw markup before parsing
    # to fix constructs known to choke sgmllib; see __init__ docstring.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    # Values for the convertEntities constructor argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Raised (e.g. by BeautifulSoup.start_meta) after the
            # document has already been re-parsed with a new encoding.
            pass
        self.markup = None                 # The markup can now be GCed

    def _feed(self, inDocumentEncoding=None):
        """Convert self.markup to Unicode, optionally massage it, and
        run it through the SGML parser to build the tree."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True selects the default fixups; a
                # list supplies custom (regexp, replacement) pairs.
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Reset the parser and the tree, making this object the root
        tag of an empty document."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Close the tag on top of the stack and make its parent the
        current tag."""
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append the tag to the current tag's children and make it
        the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush the accumulated character data into the tree as an
        instance of containerClass."""
        if self.currentData:
            currentData = ''.join(self.currentData)
            # Collapse an all-whitespace run to a single newline or
            # space.
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing only part of a document, drop top-level
            # text that the strainer does not want.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack from the innermost tag outward.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """sgmllib callback: open a new Tag (or treat the markup as
        literal text while inside a QUOTE_TAGS section)."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing only part of a document, skip top-level tags
        # the strainer does not want.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """sgmllib callback: close the named tag, leaving a quoted
        section if this end tag terminates it."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """sgmllib callback: accumulate character data until endData
        flushes it into the tree."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities in [self.HTML_ENTITIES,
                                    self.XML_ENTITIES]:
            data = unichr(int(ref))
        else:
            # Not converting: keep the reference verbatim.
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertEntities == self.HTML_ENTITIES or \
               (self.convertEntities == self.XML_ENTITIES and \
                self.XML_ENTITY_LIST.get(ref)):
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                # Unknown entity name: fall through and keep it as-is.
                pass
        if not data:
            data = '&%s;' % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated CDATA: consume to end of input.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Malformed declaration: keep it as literal text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
1230 | - | |
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Like BeautifulStoneSoup, but smart quotes are converted to
        HTML entities by default."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # NOTE(review): getattr with no default raises
                # AttributeError if declaredHTMLEncoding has never been
                # set; presumably it is initialized elsewhere in the
                # file -- confirm.
                if getattr(self, 'declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    newAttr = self.CHARSET_RE.sub\
                              (lambda(match):match.group(1) +
                               "%SOUP-ENCODING%", value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1377 | - | |
class StopParsing(Exception):
    """Raised internally to abort a parse in progress, e.g. after the
    document has been re-parsed with a newly discovered encoding."""
1380 | - | |
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): 'strong' and 'big' each appear twice in this list;
    # harmless, because buildTagMap folds the list into a map.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Extend BeautifulSoup's nesting rules with the tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1416 | - | |
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap's signature is (default, *args), so
    # buildTagMap('noscript') passes 'noscript' as the default with no
    # tag arguments and evaluates to {} -- not {'noscript': None}.
    # buildTagMap(None, 'noscript') may have been intended; confirm
    # before changing, since it would alter parse behavior.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
1429 | - | |
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, copy a single-string child tag up into its
        parent as an attribute (see the class docstring), unless the
        parent already has an attribute of that name."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
1460 | - | |
1461 | -#Enterprise class names! It has come to our attention that some people | |
1462 | -#think the names of the Beautiful Soup parser classes are too silly | |
1463 | -#and "unprofessional" for use in enterprise screen-scraping. We feel | |
1464 | -#your pain! For such-minded folk, the Beautiful Soup Consortium And | |
1465 | -#All-Night Kosher Bakery recommends renaming this file to | |
1466 | -#"RobustParser.py" (or, in cases of extreme enterprisness, | |
1467 | -#"RobustParserBeanInterface.class") and using the following | |
1468 | -#enterprise-friendly class aliases: | |
class RobustXMLParser(BeautifulStoneSoup):
    # Alias: identical behavior to BeautifulStoneSoup.
    pass
class RobustHTMLParser(BeautifulSoup):
    # Alias: identical behavior to BeautifulSoup.
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    # Alias: identical behavior to ICantBelieveItsBeautifulSoup.
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    # Alias: identical behavior to MinimalSoup.
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    # Alias: identical behavior to BeautifulSOAP.
    pass
1479 | - | |
1480 | -###################################################### | |
1481 | -# | |
1482 | -# Bonus library: Unicode, Dammit | |
1483 | -# | |
1484 | -# This class forces XML data into a standard format (usually to UTF-8 | |
1485 | -# or Unicode). It is heavily based on code from Mark Pilgrim's | |
1486 | -# Universal Feed Parser. It does not rewrite the XML or HTML to | |
1487 | -# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi | |
1488 | -# (XML) and BeautifulSoup.start_meta (HTML). | |
1489 | - | |
1490 | -# Autodetects character encodings. | |
1491 | -# Download from http://chardet.feedparser.org/ | |
# chardet is optional; when missing, UnicodeDammit falls back to its
# other encoding-detection strategies.
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    # Narrowed from a bare "except:" so real errors (e.g.
    # KeyboardInterrupt) are not swallowed.
    chardet = None
# Fixed: an unconditional "chardet = None" after this try block (a
# debug leftover) was disabling chardet even when it was installed.
1499 | - | |
1500 | -# cjkcodecs and iconv_codec make Python know about more character encodings. | |
1501 | -# Both are available from http://cjkpython.i18n.org/ | |
1502 | -# They're built in if you use Python 2.4. | |
# Both codec packages are optional extras that merely register extra
# character encodings; it is fine if either is missing.
try:
    import cjkcodecs.aliases
except ImportError:
    # Narrowed from a bare "except:" so only a missing package is
    # ignored, not arbitrary errors.
    pass
try:
    import iconv_codec
except ImportError:
    pass
1511 | - | |
1512 | -class UnicodeDammit: | |
1513 | - """A class for detecting the encoding of a *ML document and | |
1514 | - converting it to a Unicode string. If the source encoding is | |
1515 | - windows-1252, can replace MS smart quotes with their HTML or XML | |
1516 | - equivalents.""" | |
1517 | - | |
1518 | - # This dictionary maps commonly seen values for "charset" in HTML | |
1519 | - # meta tags to the corresponding Python codec names. It only covers | |
1520 | - # values that aren't in Python's aliases and can't be determined | |
1521 | - # by the heuristics in find_codec. | |
1522 | - CHARSET_ALIASES = { "macintosh" : "mac-roman", | |
1523 | - "x-sjis" : "shift-jis" } | |
1524 | - | |
1525 | - def __init__(self, markup, overrideEncodings=[], | |
1526 | - smartQuotesTo='xml'): | |
1527 | - self.markup, documentEncoding, sniffedEncoding = \ | |
1528 | - self._detectEncoding(markup) | |
1529 | - self.smartQuotesTo = smartQuotesTo | |
1530 | - self.triedEncodings = [] | |
1531 | - if markup == '' or isinstance(markup, unicode): | |
1532 | - self.originalEncoding = None | |
1533 | - self.unicode = unicode(markup) | |
1534 | - return | |
1535 | - | |
1536 | - u = None | |
1537 | - for proposedEncoding in overrideEncodings: | |
1538 | - u = self._convertFrom(proposedEncoding) | |
1539 | - if u: break | |
1540 | - if not u: | |
1541 | - for proposedEncoding in (documentEncoding, sniffedEncoding): | |
1542 | - u = self._convertFrom(proposedEncoding) | |
1543 | - if u: break | |
1544 | - | |
1545 | - # If no luck and we have auto-detection library, try that: | |
1546 | - if not u and chardet and not isinstance(self.markup, unicode): | |
1547 | - u = self._convertFrom(chardet.detect(self.markup)['encoding']) | |
1548 | - | |
1549 | - # As a last resort, try utf-8 and windows-1252: | |
1550 | - if not u: | |
1551 | - for proposed_encoding in ("utf-8", "windows-1252"): | |
1552 | - u = self._convertFrom(proposed_encoding) | |
1553 | - if u: break | |
1554 | - self.unicode = u | |
1555 | - if not u: self.originalEncoding = None | |
1556 | - | |
1557 | - def _subMSChar(self, orig): | |
1558 | - """Changes a MS smart quote character to an XML or HTML | |
1559 | - entity.""" | |
1560 | - sub = self.MS_CHARS.get(orig) | |
1561 | - if type(sub) == types.TupleType: | |
1562 | - if self.smartQuotesTo == 'xml': | |
1563 | - sub = '&#x%s;' % sub[1] | |
1564 | - else: | |
1565 | - sub = '&%s;' % sub[0] | |
1566 | - return sub | |
1567 | - | |
1568 | - def _convertFrom(self, proposed): | |
1569 | - proposed = self.find_codec(proposed) | |
1570 | - if not proposed or proposed in self.triedEncodings: | |
1571 | - return None | |
1572 | - self.triedEncodings.append(proposed) | |
1573 | - markup = self.markup | |
1574 | - | |
1575 | - # Convert smart quotes to HTML if coming from an encoding | |
1576 | - # that might have them. | |
1577 | - if self.smartQuotesTo and proposed.lower() in("windows-1252", | |
1578 | - "iso-8859-1", | |
1579 | - "iso-8859-2"): | |
1580 | - markup = re.compile("([\x80-\x9f])").sub \ | |
1581 | - (lambda(x): self._subMSChar(x.group(1)), | |
1582 | - markup) | |
1583 | - | |
1584 | - try: | |
1585 | - # print "Trying to convert document to %s" % proposed | |
1586 | - u = self._toUnicode(markup, proposed) | |
1587 | - self.markup = u | |
1588 | - self.originalEncoding = proposed | |
1589 | - except Exception, e: | |
1590 | - # print "That didn't work!" | |
1591 | - # print e | |
1592 | - return None | |
1593 | - #print "Correct encoding: %s" % proposed | |
1594 | - return self.markup | |
1595 | - | |
1596 | - def _toUnicode(self, data, encoding): | |
1597 | - '''Given a string and its encoding, decodes the string into Unicode. | |
1598 | - %encoding is a string recognized by encodings.aliases''' | |
1599 | - | |
1600 | - # strip Byte Order Mark (if present) | |
1601 | - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ | |
1602 | - and (data[2:4] != '\x00\x00'): | |
1603 | - encoding = 'utf-16be' | |
1604 | - data = data[2:] | |
1605 | - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ | |
1606 | - and (data[2:4] != '\x00\x00'): | |
1607 | - encoding = 'utf-16le' | |
1608 | - data = data[2:] | |
1609 | - elif data[:3] == '\xef\xbb\xbf': | |
1610 | - encoding = 'utf-8' | |
1611 | - data = data[3:] | |
1612 | - elif data[:4] == '\x00\x00\xfe\xff': | |
1613 | - encoding = 'utf-32be' | |
1614 | - data = data[4:] | |
1615 | - elif data[:4] == '\xff\xfe\x00\x00': | |
1616 | - encoding = 'utf-32le' | |
1617 | - data = data[4:] | |
1618 | - newdata = unicode(data, encoding) | |
1619 | - return newdata | |
1620 | - | |
1621 | - def _detectEncoding(self, xml_data): | |
1622 | - """Given a document, tries to detect its XML encoding.""" | |
1623 | - xml_encoding = sniffed_xml_encoding = None | |
1624 | - try: | |
1625 | - if xml_data[:4] == '\x4c\x6f\xa7\x94': | |
1626 | - # EBCDIC | |
1627 | - xml_data = self._ebcdic_to_ascii(xml_data) | |
1628 | - elif xml_data[:4] == '\x00\x3c\x00\x3f': | |
1629 | - # UTF-16BE | |
1630 | - sniffed_xml_encoding = 'utf-16be' | |
1631 | - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') | |
1632 | - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ | |
1633 | - and (xml_data[2:4] != '\x00\x00'): | |
1634 | - # UTF-16BE with BOM | |
1635 | - sniffed_xml_encoding = 'utf-16be' | |
1636 | - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') | |
1637 | - elif xml_data[:4] == '\x3c\x00\x3f\x00': | |
1638 | - # UTF-16LE | |
1639 | - sniffed_xml_encoding = 'utf-16le' | |
1640 | - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') | |
1641 | - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ | |
1642 | - (xml_data[2:4] != '\x00\x00'): | |
1643 | - # UTF-16LE with BOM | |
1644 | - sniffed_xml_encoding = 'utf-16le' | |
1645 | - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') | |
1646 | - elif xml_data[:4] == '\x00\x00\x00\x3c': | |
1647 | - # UTF-32BE | |
1648 | - sniffed_xml_encoding = 'utf-32be' | |
1649 | - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') | |
1650 | - elif xml_data[:4] == '\x3c\x00\x00\x00': | |
1651 | - # UTF-32LE | |
1652 | - sniffed_xml_encoding = 'utf-32le' | |
1653 | - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') | |
1654 | - elif xml_data[:4] == '\x00\x00\xfe\xff': | |
1655 | - # UTF-32BE with BOM | |
1656 | - sniffed_xml_encoding = 'utf-32be' | |
1657 | - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') | |
1658 | - elif xml_data[:4] == '\xff\xfe\x00\x00': | |
1659 | - # UTF-32LE with BOM | |
1660 | - sniffed_xml_encoding = 'utf-32le' | |
1661 | - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') | |
1662 | - elif xml_data[:3] == '\xef\xbb\xbf': | |
1663 | - # UTF-8 with BOM | |
1664 | - sniffed_xml_encoding = 'utf-8' | |
1665 | - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') | |
1666 | - else: | |
1667 | - sniffed_xml_encoding = 'ascii' | |
1668 | - pass | |
1669 | - xml_encoding_match = re.compile \ | |
1670 | - ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\ | |
1671 | - .match(xml_data) | |
1672 | - except: | |
1673 | - xml_encoding_match = None | |
1674 | - if xml_encoding_match: | |
1675 | - xml_encoding = xml_encoding_match.groups()[0].lower() | |
1676 | - if sniffed_xml_encoding and \ | |
1677 | - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', | |
1678 | - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', | |
1679 | - 'utf-16', 'utf-32', 'utf_16', 'utf_32', | |
1680 | - 'utf16', 'u16')): | |
1681 | - xml_encoding = sniffed_xml_encoding | |
1682 | - return xml_data, xml_encoding, sniffed_xml_encoding | |
1683 | - | |
1684 | - | |
1685 | - def find_codec(self, charset): | |
1686 | - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ | |
1687 | - or (charset and self._codec(charset.replace("-", ""))) \ | |
1688 | - or (charset and self._codec(charset.replace("-", "_"))) \ | |
1689 | - or charset | |
1690 | - | |
1691 | - def _codec(self, charset): | |
1692 | - if not charset: return charset | |
1693 | - codec = None | |
1694 | - try: | |
1695 | - codecs.lookup(charset) | |
1696 | - codec = charset | |
1697 | - except LookupError: | |
1698 | - pass | |
1699 | - return codec | |
1700 | - | |
1701 | - EBCDIC_TO_ASCII_MAP = None | |
1702 | - def _ebcdic_to_ascii(self, s): | |
1703 | - c = self.__class__ | |
1704 | - if not c.EBCDIC_TO_ASCII_MAP: | |
1705 | - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, | |
1706 | - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, | |
1707 | - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, | |
1708 | - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, | |
1709 | - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, | |
1710 | - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, | |
1711 | - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, | |
1712 | - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, | |
1713 | - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, | |
1714 | - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, | |
1715 | - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, | |
1716 | - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, | |
1717 | - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, | |
1718 | - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, | |
1719 | - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, | |
1720 | - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, | |
1721 | - 250,251,252,253,254,255) | |
1722 | - import string | |
1723 | - c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ | |
1724 | - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) | |
1725 | - return s.translate(c.EBCDIC_TO_ASCII_MAP) | |
1726 | - | |
1727 | - MS_CHARS = { '\x80' : ('euro', '20AC'), | |
1728 | - '\x81' : ' ', | |
1729 | - '\x82' : ('sbquo', '201A'), | |
1730 | - '\x83' : ('fnof', '192'), | |
1731 | - '\x84' : ('bdquo', '201E'), | |
1732 | - '\x85' : ('hellip', '2026'), | |
1733 | - '\x86' : ('dagger', '2020'), | |
1734 | - '\x87' : ('Dagger', '2021'), | |
1735 | - '\x88' : ('circ', '2C6'), | |
1736 | - '\x89' : ('permil', '2030'), | |
1737 | - '\x8A' : ('Scaron', '160'), | |
1738 | - '\x8B' : ('lsaquo', '2039'), | |
1739 | - '\x8C' : ('OElig', '152'), | |
1740 | - '\x8D' : '?', | |
1741 | - '\x8E' : ('#x17D', '17D'), | |
1742 | - '\x8F' : '?', | |
1743 | - '\x90' : '?', | |
1744 | - '\x91' : ('lsquo', '2018'), | |
1745 | - '\x92' : ('rsquo', '2019'), | |
1746 | - '\x93' : ('ldquo', '201C'), | |
1747 | - '\x94' : ('rdquo', '201D'), | |
1748 | - '\x95' : ('bull', '2022'), | |
1749 | - '\x96' : ('ndash', '2013'), | |
1750 | - '\x97' : ('mdash', '2014'), | |
1751 | - '\x98' : ('tilde', '2DC'), | |
1752 | - '\x99' : ('trade', '2122'), | |
1753 | - '\x9a' : ('scaron', '161'), | |
1754 | - '\x9b' : ('rsaquo', '203A'), | |
1755 | - '\x9c' : ('oelig', '153'), | |
1756 | - '\x9d' : '?', | |
1757 | - '\x9e' : ('#x17E', '17E'), | |
1758 | - '\x9f' : ('Yuml', ''),} | |
1759 | - | |
1760 | -####################################################################### | |
1761 | - | |
1762 | - | |
1763 | -#By default, act as an HTML pretty-printer. | |
1764 | -if __name__ == '__main__': | |
1765 | - import sys | |
1766 | - soup = BeautifulSoup(sys.stdin.read()) | |
1767 | - print soup.prettify() |
pacotes/openlayers/tools/README.txt
... | ... | @@ -1,14 +0,0 @@ |
1 | -This directory contains tools used in the packaging or deployment of OpenLayers. | |
2 | - | |
3 | -Javascript minimizing tools: | |
4 | - | |
5 | - * jsmin.c, jsmin.py: | |
6 | - jsmin.py is a direct translation of the jsmin.c code into Python. jsmin.py | |
7 | - will therefore run anyplace Python runs... but at significantly slower speed. | |
8 | - | |
9 | - * shrinksafe.py | |
10 | - shrinksafe.py calls out to a third party javascript shrinking service. This | |
11 | - creates file sizes about 4% smaller (as of commit 501) of the OpenLayers | |
12 | - code. However, this also has the side effect of making you dependant on the | |
13 | - web service -- and since that service sometimes goes dead, it's risky to | |
14 | - depend on it. |
pacotes/openlayers/tools/exampleparser.py
... | ... | @@ -1,251 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | - | |
3 | -import sys | |
4 | -import os | |
5 | -import re | |
6 | -import urllib2 | |
7 | -import time | |
8 | -from xml.dom.minidom import Document | |
9 | - | |
10 | -try: | |
11 | - import xml.etree.ElementTree as ElementTree | |
12 | -except ImportError: | |
13 | - try: | |
14 | - import cElementTree as ElementTree | |
15 | - except ImportError: | |
16 | - try: | |
17 | - import elementtree.ElementTree as ElementTree | |
18 | - except ImportError: | |
19 | - import lxml.etree as ElementTree | |
20 | - | |
21 | -missing_deps = False | |
22 | -try: | |
23 | - import simplejson | |
24 | - from BeautifulSoup import BeautifulSoup | |
25 | -except ImportError, E: | |
26 | - missing_deps = E | |
27 | - | |
28 | -feedName = "example-list.xml" | |
29 | -feedPath = "http://openlayers.org/dev/examples/" | |
30 | - | |
31 | -def getListOfOnlineExamples(baseUrl): | |
32 | - """ | |
33 | - useful if you want to get a list of examples a url. not used by default. | |
34 | - """ | |
35 | - html = urllib2.urlopen(baseUrl) | |
36 | - soup = BeautifulSoup(html) | |
37 | - examples = soup.findAll('li') | |
38 | - examples = [example.find('a').get('href') for example in examples] | |
39 | - examples = [example for example in examples if example.endswith('.html')] | |
40 | - examples = [example for example in examples] | |
41 | - return examples | |
42 | - | |
43 | -def getListOfExamples(relPath): | |
44 | - """ | |
45 | - returns list of .html filenames within a given path - excludes example-list.html | |
46 | - """ | |
47 | - examples = os.listdir(relPath) | |
48 | - examples = [example for example in examples if example.endswith('.html') and example != "example-list.html"] | |
49 | - return examples | |
50 | - | |
51 | - | |
52 | -def getExampleHtml(location): | |
53 | - """ | |
54 | - returns html of a specific example that is available online or locally | |
55 | - """ | |
56 | - print '.', | |
57 | - if location.startswith('http'): | |
58 | - return urllib2.urlopen(location).read() | |
59 | - else: | |
60 | - f = open(location) | |
61 | - html = f.read() | |
62 | - f.close() | |
63 | - return html | |
64 | - | |
65 | - | |
66 | -def extractById(soup, tagId, value=None): | |
67 | - """ | |
68 | - returns full contents of a particular tag id | |
69 | - """ | |
70 | - beautifulTag = soup.find(id=tagId) | |
71 | - if beautifulTag: | |
72 | - if beautifulTag.contents: | |
73 | - value = str(beautifulTag.renderContents()).strip() | |
74 | - value = value.replace('\t','') | |
75 | - value = value.replace('\n','') | |
76 | - return value | |
77 | - | |
78 | -def getRelatedClasses(html): | |
79 | - """ | |
80 | - parses the html, and returns a list of all OpenLayers Classes | |
81 | - used within (ie what parts of OL the javascript uses). | |
82 | - """ | |
83 | - rawstr = r'''(?P<class>OpenLayers\..*?)\(''' | |
84 | - return re.findall(rawstr, html) | |
85 | - | |
86 | -def parseHtml(html,ids): | |
87 | - """ | |
88 | - returns dictionary of items of interest | |
89 | - """ | |
90 | - soup = BeautifulSoup(html) | |
91 | - d = {} | |
92 | - for tagId in ids: | |
93 | - d[tagId] = extractById(soup,tagId) | |
94 | - #classes should eventually be parsed from docs - not automatically created. | |
95 | - classes = getRelatedClasses(html) | |
96 | - d['classes'] = classes | |
97 | - return d | |
98 | - | |
99 | -def getSvnInfo(path): | |
100 | - h = os.popen("svn info %s --xml" % path) | |
101 | - tree = ElementTree.fromstring(h.read()) | |
102 | - h.close() | |
103 | - d = { | |
104 | - 'url': tree.findtext('entry/url'), | |
105 | - 'author': tree.findtext('entry/commit/author'), | |
106 | - 'date': tree.findtext('entry/commit/date') | |
107 | - } | |
108 | - return d | |
109 | - | |
110 | -def createFeed(examples): | |
111 | - doc = Document() | |
112 | - atomuri = "http://www.w3.org/2005/Atom" | |
113 | - feed = doc.createElementNS(atomuri, "feed") | |
114 | - feed.setAttribute("xmlns", atomuri) | |
115 | - title = doc.createElementNS(atomuri, "title") | |
116 | - title.appendChild(doc.createTextNode("OpenLayers Examples")) | |
117 | - feed.appendChild(title) | |
118 | - link = doc.createElementNS(atomuri, "link") | |
119 | - link.setAttribute("rel", "self") | |
120 | - link.setAttribute("href", feedPath + feedName) | |
121 | - | |
122 | - modtime = time.strftime("%Y-%m-%dT%I:%M:%SZ", time.gmtime()) | |
123 | - id = doc.createElementNS(atomuri, "id") | |
124 | - id.appendChild(doc.createTextNode("%s%s#%s" % (feedPath, feedName, modtime))) | |
125 | - feed.appendChild(id) | |
126 | - | |
127 | - updated = doc.createElementNS(atomuri, "updated") | |
128 | - updated.appendChild(doc.createTextNode(modtime)) | |
129 | - feed.appendChild(updated) | |
130 | - | |
131 | - examples.sort(key=lambda x:x["modified"]) | |
132 | - for example in sorted(examples, key=lambda x:x["modified"], reverse=True): | |
133 | - entry = doc.createElementNS(atomuri, "entry") | |
134 | - | |
135 | - title = doc.createElementNS(atomuri, "title") | |
136 | - title.appendChild(doc.createTextNode(example["title"] or example["example"])) | |
137 | - entry.appendChild(title) | |
138 | - | |
139 | - link = doc.createElementNS(atomuri, "link") | |
140 | - link.setAttribute("href", "%s%s" % (feedPath, example["example"])) | |
141 | - entry.appendChild(link) | |
142 | - | |
143 | - summary = doc.createElementNS(atomuri, "summary") | |
144 | - summary.appendChild(doc.createTextNode(example["shortdesc"] or example["example"])) | |
145 | - entry.appendChild(summary) | |
146 | - | |
147 | - updated = doc.createElementNS(atomuri, "updated") | |
148 | - updated.appendChild(doc.createTextNode(example["modified"])) | |
149 | - entry.appendChild(updated) | |
150 | - | |
151 | - author = doc.createElementNS(atomuri, "author") | |
152 | - name = doc.createElementNS(atomuri, "name") | |
153 | - name.appendChild(doc.createTextNode(example["author"])) | |
154 | - author.appendChild(name) | |
155 | - entry.appendChild(author) | |
156 | - | |
157 | - id = doc.createElementNS(atomuri, "id") | |
158 | - id.appendChild(doc.createTextNode("%s%s#%s" % (feedPath, example["example"], example["modified"]))) | |
159 | - entry.appendChild(id) | |
160 | - | |
161 | - feed.appendChild(entry) | |
162 | - | |
163 | - doc.appendChild(feed) | |
164 | - return doc | |
165 | - | |
166 | -def wordIndex(examples): | |
167 | - """ | |
168 | - Create an inverted index based on words in title and shortdesc. Keys are | |
169 | - lower cased words. Values are dictionaries with example index keys and | |
170 | - count values. | |
171 | - """ | |
172 | - index = {} | |
173 | - unword = re.compile("\\W+") | |
174 | - keys = ["shortdesc", "title"] | |
175 | - for i in range(len(examples)): | |
176 | - for key in keys: | |
177 | - text = examples[i][key] | |
178 | - if text: | |
179 | - words = unword.split(text) | |
180 | - for word in words: | |
181 | - if word: | |
182 | - word = word.lower() | |
183 | - if index.has_key(word): | |
184 | - if index[word].has_key(i): | |
185 | - index[word][i] += 1 | |
186 | - else: | |
187 | - index[word][i] = 1 | |
188 | - else: | |
189 | - index[word] = {i: 1} | |
190 | - return index | |
191 | - | |
192 | -if __name__ == "__main__": | |
193 | - | |
194 | - if missing_deps: | |
195 | - print "This script requires simplejson and BeautifulSoup. You don't have them. \n(%s)" % E | |
196 | - sys.exit() | |
197 | - | |
198 | - if len(sys.argv) > 1: | |
199 | - outFile = open(sys.argv[1],'w') | |
200 | - else: | |
201 | - outFile = open('../examples/example-list.js','w') | |
202 | - | |
203 | - examplesLocation = '../examples' | |
204 | - print 'Reading examples from %s and writing out to %s' % (examplesLocation, outFile.name) | |
205 | - | |
206 | - exampleList = [] | |
207 | - docIds = ['title','shortdesc'] | |
208 | - | |
209 | - #comment out option to create docs from online resource | |
210 | - #examplesLocation = 'http://svn.openlayers.org/sandbox/docs/examples/' | |
211 | - #examples = getListOfOnlineExamples(examplesLocation) | |
212 | - | |
213 | - examples = getListOfExamples(examplesLocation) | |
214 | - | |
215 | - modtime = time.strftime("%Y-%m-%dT%I:%M:%SZ", time.gmtime()) | |
216 | - | |
217 | - for example in examples: | |
218 | - url = os.path.join(examplesLocation,example) | |
219 | - html = getExampleHtml(url) | |
220 | - tagvalues = parseHtml(html,docIds) | |
221 | - tagvalues['example'] = example | |
222 | - # add in svn info | |
223 | - d = getSvnInfo(url) | |
224 | - tagvalues["modified"] = d["date"] or modtime | |
225 | - tagvalues["author"] = d["author"] or "anonymous" | |
226 | - tagvalues['link'] = example | |
227 | - | |
228 | - exampleList.append(tagvalues) | |
229 | - | |
230 | ||
231 | - | |
232 | - exampleList.sort(key=lambda x:x['example'].lower()) | |
233 | - | |
234 | - index = wordIndex(exampleList) | |
235 | - | |
236 | - json = simplejson.dumps({"examples": exampleList, "index": index}) | |
237 | - #give the json a global variable we can use in our js. This should be replaced or made optional. | |
238 | - json = 'var info=' + json | |
239 | - outFile.write(json) | |
240 | - outFile.close() | |
241 | - | |
242 | - print "writing feed to ../examples/%s " % feedName | |
243 | - atom = open('../examples/%s' % feedName, 'w') | |
244 | - doc = createFeed(exampleList) | |
245 | - atom.write(doc.toxml()) | |
246 | - atom.close() | |
247 | - | |
248 | - | |
249 | - print 'complete' | |
250 | - | |
251 | - |
pacotes/openlayers/tools/jsmin.c
... | ... | @@ -1,272 +0,0 @@ |
1 | -/* jsmin.c | |
2 | - 2006-05-04 | |
3 | - | |
4 | -Copyright (c) 2002 Douglas Crockford (www.crockford.com) | |
5 | - | |
6 | -Permission is hereby granted, free of charge, to any person obtaining a copy of | |
7 | -this software and associated documentation files (the "Software"), to deal in | |
8 | -the Software without restriction, including without limitation the rights to | |
9 | -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
10 | -of the Software, and to permit persons to whom the Software is furnished to do | |
11 | -so, subject to the following conditions: | |
12 | - | |
13 | -The above copyright notice and this permission notice shall be included in all | |
14 | -copies or substantial portions of the Software. | |
15 | - | |
16 | -The Software shall be used for Good, not Evil. | |
17 | - | |
18 | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 | -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 | -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
21 | -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 | -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
23 | -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
24 | -SOFTWARE. | |
25 | -*/ | |
26 | - | |
27 | -#include <stdlib.h> | |
28 | -#include <stdio.h> | |
29 | - | |
30 | -static int theA; | |
31 | -static int theB; | |
32 | -static int theLookahead = EOF; | |
33 | - | |
34 | - | |
35 | -/* isAlphanum -- return true if the character is a letter, digit, underscore, | |
36 | - dollar sign, or non-ASCII character. | |
37 | -*/ | |
38 | - | |
39 | -static int | |
40 | -isAlphanum(int c) | |
41 | -{ | |
42 | - return ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || | |
43 | - (c >= 'A' && c <= 'Z') || c == '_' || c == '$' || c == '\\' || | |
44 | - c > 126); | |
45 | -} | |
46 | - | |
47 | - | |
48 | -/* get -- return the next character from stdin. Watch out for lookahead. If | |
49 | - the character is a control character, translate it to a space or | |
50 | - linefeed. | |
51 | -*/ | |
52 | - | |
53 | -static int | |
54 | -get() | |
55 | -{ | |
56 | - int c = theLookahead; | |
57 | - theLookahead = EOF; | |
58 | - if (c == EOF) { | |
59 | - c = getc(stdin); | |
60 | - } | |
61 | - if (c >= ' ' || c == '\n' || c == EOF) { | |
62 | - return c; | |
63 | - } | |
64 | - if (c == '\r') { | |
65 | - return '\n'; | |
66 | - } | |
67 | - return ' '; | |
68 | -} | |
69 | - | |
70 | - | |
71 | -/* peek -- get the next character without getting it. | |
72 | -*/ | |
73 | - | |
74 | -static int | |
75 | -peek() | |
76 | -{ | |
77 | - theLookahead = get(); | |
78 | - return theLookahead; | |
79 | -} | |
80 | - | |
81 | - | |
82 | -/* next -- get the next character, excluding comments. peek() is used to see | |
83 | - if a '/' is followed by a '/' or '*'. | |
84 | -*/ | |
85 | - | |
86 | -static int | |
87 | -next() | |
88 | -{ | |
89 | - int c = get(); | |
90 | - if (c == '/') { | |
91 | - switch (peek()) { | |
92 | - case '/': | |
93 | - for (;;) { | |
94 | - c = get(); | |
95 | - if (c <= '\n') { | |
96 | - return c; | |
97 | - } | |
98 | - } | |
99 | - case '*': | |
100 | - get(); | |
101 | - for (;;) { | |
102 | - switch (get()) { | |
103 | - case '*': | |
104 | - if (peek() == '/') { | |
105 | - get(); | |
106 | - return ' '; | |
107 | - } | |
108 | - break; | |
109 | - case EOF: | |
110 | - fprintf(stderr, "Error: JSMIN Unterminated comment.\n"); | |
111 | - exit(1); | |
112 | - } | |
113 | - } | |
114 | - default: | |
115 | - return c; | |
116 | - } | |
117 | - } | |
118 | - return c; | |
119 | -} | |
120 | - | |
121 | - | |
122 | -/* action -- do something! What you do is determined by the argument: | |
123 | - 1 Output A. Copy B to A. Get the next B. | |
124 | - 2 Copy B to A. Get the next B. (Delete A). | |
125 | - 3 Get the next B. (Delete B). | |
126 | - action treats a string as a single character. Wow! | |
127 | - action recognizes a regular expression if it is preceded by ( or , or =. | |
128 | -*/ | |
129 | - | |
130 | -static void | |
131 | -action(int d) | |
132 | -{ | |
133 | - switch (d) { | |
134 | - case 1: | |
135 | - putc(theA, stdout); | |
136 | - case 2: | |
137 | - theA = theB; | |
138 | - if (theA == '\'' || theA == '"') { | |
139 | - for (;;) { | |
140 | - putc(theA, stdout); | |
141 | - theA = get(); | |
142 | - if (theA == theB) { | |
143 | - break; | |
144 | - } | |
145 | - if (theA <= '\n') { | |
146 | - fprintf(stderr, | |
147 | -"Error: JSMIN unterminated string literal: %c\n", theA); | |
148 | - exit(1); | |
149 | - } | |
150 | - if (theA == '\\') { | |
151 | - putc(theA, stdout); | |
152 | - theA = get(); | |
153 | - } | |
154 | - } | |
155 | - } | |
156 | - case 3: | |
157 | - theB = next(); | |
158 | - if (theB == '/' && (theA == '(' || theA == ',' || theA == '=' || | |
159 | - theA == ':' || theA == '[' || theA == '!' || theA == '&' || | |
160 | - theA == '|')) { | |
161 | - putc(theA, stdout); | |
162 | - putc(theB, stdout); | |
163 | - for (;;) { | |
164 | - theA = get(); | |
165 | - if (theA == '/') { | |
166 | - break; | |
167 | - } else if (theA =='\\') { | |
168 | - putc(theA, stdout); | |
169 | - theA = get(); | |
170 | - } else if (theA <= '\n') { | |
171 | - fprintf(stderr, | |
172 | -"Error: JSMIN unterminated Regular Expression literal.\n", theA); | |
173 | - exit(1); | |
174 | - } | |
175 | - putc(theA, stdout); | |
176 | - } | |
177 | - theB = next(); | |
178 | - } | |
179 | - } | |
180 | -} | |
181 | - | |
182 | - | |
183 | -/* jsmin -- Copy the input to the output, deleting the characters which are | |
184 | - insignificant to JavaScript. Comments will be removed. Tabs will be | |
185 | - replaced with spaces. Carriage returns will be replaced with linefeeds. | |
186 | - Most spaces and linefeeds will be removed. | |
187 | -*/ | |
188 | - | |
189 | -static void | |
190 | -jsmin() | |
191 | -{ | |
192 | - theA = '\n'; | |
193 | - action(3); | |
194 | - while (theA != EOF) { | |
195 | - switch (theA) { | |
196 | - case ' ': | |
197 | - if (isAlphanum(theB)) { | |
198 | - action(1); | |
199 | - } else { | |
200 | - action(2); | |
201 | - } | |
202 | - break; | |
203 | - case '\n': | |
204 | - switch (theB) { | |
205 | - case '{': | |
206 | - case '[': | |
207 | - case '(': | |
208 | - case '+': | |
209 | - case '-': | |
210 | - action(1); | |
211 | - break; | |
212 | - case ' ': | |
213 | - action(3); | |
214 | - break; | |
215 | - default: | |
216 | - if (isAlphanum(theB)) { | |
217 | - action(1); | |
218 | - } else { | |
219 | - action(2); | |
220 | - } | |
221 | - } | |
222 | - break; | |
223 | - default: | |
224 | - switch (theB) { | |
225 | - case ' ': | |
226 | - if (isAlphanum(theA)) { | |
227 | - action(1); | |
228 | - break; | |
229 | - } | |
230 | - action(3); | |
231 | - break; | |
232 | - case '\n': | |
233 | - switch (theA) { | |
234 | - case '}': | |
235 | - case ']': | |
236 | - case ')': | |
237 | - case '+': | |
238 | - case '-': | |
239 | - case '"': | |
240 | - case '\'': | |
241 | - action(1); | |
242 | - break; | |
243 | - default: | |
244 | - if (isAlphanum(theA)) { | |
245 | - action(1); | |
246 | - } else { | |
247 | - action(3); | |
248 | - } | |
249 | - } | |
250 | - break; | |
251 | - default: | |
252 | - action(1); | |
253 | - break; | |
254 | - } | |
255 | - } | |
256 | - } | |
257 | -} | |
258 | - | |
259 | - | |
260 | -/* main -- Output any command line arguments as comments | |
261 | - and then minify the input. | |
262 | -*/ | |
263 | -extern int | |
264 | -main(int argc, char* argv[]) | |
265 | -{ | |
266 | - int i; | |
267 | - for (i = 1; i < argc; i += 1) { | |
268 | - fprintf(stdout, "// %s\n", argv[i]); | |
269 | - } | |
270 | - jsmin(); | |
271 | - return 0; | |
272 | -} |
pacotes/openlayers/tools/jsmin.py
... | ... | @@ -1,216 +0,0 @@ |
1 | -#!/usr/bin/python | |
2 | - | |
3 | -# This code is original from jsmin by Douglas Crockford, it was translated to | |
4 | -# Python by Baruch Even. The original code had the following copyright and | |
5 | -# license. | |
6 | -# | |
7 | -# /* jsmin.c | |
8 | -# 2007-01-08 | |
9 | -# | |
10 | -# Copyright (c) 2002 Douglas Crockford (www.crockford.com) | |
11 | -# | |
12 | -# Permission is hereby granted, free of charge, to any person obtaining a copy of | |
13 | -# this software and associated documentation files (the "Software"), to deal in | |
14 | -# the Software without restriction, including without limitation the rights to | |
15 | -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
16 | -# of the Software, and to permit persons to whom the Software is furnished to do | |
17 | -# so, subject to the following conditions: | |
18 | -# | |
19 | -# The above copyright notice and this permission notice shall be included in all | |
20 | -# copies or substantial portions of the Software. | |
21 | -# | |
22 | -# The Software shall be used for Good, not Evil. | |
23 | -# | |
24 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
25 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
26 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
27 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
28 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
29 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
30 | -# SOFTWARE. | |
31 | -# */ | |
32 | - | |
33 | -from StringIO import StringIO | |
34 | - | |
35 | -def jsmin(js): | |
36 | - ins = StringIO(js) | |
37 | - outs = StringIO() | |
38 | - JavascriptMinify().minify(ins, outs) | |
39 | - str = outs.getvalue() | |
40 | - if len(str) > 0 and str[0] == '\n': | |
41 | - str = str[1:] | |
42 | - return str | |
43 | - | |
44 | -def isAlphanum(c): | |
45 | - """return true if the character is a letter, digit, underscore, | |
46 | - dollar sign, or non-ASCII character. | |
47 | - """ | |
48 | - return ((c >= 'a' and c <= 'z') or (c >= '0' and c <= '9') or | |
49 | - (c >= 'A' and c <= 'Z') or c == '_' or c == '$' or c == '\\' or (c is not None and ord(c) > 126)); | |
50 | - | |
51 | -class UnterminatedComment(Exception): | |
52 | - pass | |
53 | - | |
54 | -class UnterminatedStringLiteral(Exception): | |
55 | - pass | |
56 | - | |
57 | -class UnterminatedRegularExpression(Exception): | |
58 | - pass | |
59 | - | |
class JavascriptMinify(object):
    """Port of Douglas Crockford's jsmin.c.

    Copies JavaScript from `instream` to `outstream` with comments removed
    and most whitespace collapsed; string and regular-expression literals
    are passed through verbatim.  The algorithm is a two-character sliding
    window (`theA`, `theB`) driven by _action(); `theLookahead` holds a
    single pushed-back character.
    """

    def _outA(self):
        # Emit the leading character of the window.
        self.outstream.write(self.theA)
    def _outB(self):
        # Emit the trailing character of the window.
        self.outstream.write(self.theB)

    def _get(self):
        """return the next character from stdin. Watch out for lookahead. If
        the character is a control character, translate it to a space or
        linefeed.
        """
        c = self.theLookahead
        self.theLookahead = None
        if c is None:  # identity test (was `== None`)
            c = self.instream.read(1)
        if c >= ' ' or c == '\n':
            return c
        if c == '': # EOF
            return '\000'
        if c == '\r':
            return '\n'
        return ' '

    def _peek(self):
        # Look one character ahead without consuming it.
        self.theLookahead = self._get()
        return self.theLookahead

    def _next(self):
        """get the next character, excluding comments. peek() is used to see
        if a '/' is followed by a '/' or '*'.
        """
        c = self._get()
        if c == '/':
            p = self._peek()
            if p == '/':
                # "//" line comment: discard up to the end of the line.
                c = self._get()
                while c > '\n':
                    c = self._get()
                return c
            if p == '*':
                # "/* ... */" block comment: discard it, yield one space.
                c = self._get()
                while 1:
                    c = self._get()
                    if c == '*':
                        if self._peek() == '/':
                            self._get()
                            return ' '
                    if c == '\000':
                        raise UnterminatedComment()

        return c

    def _action(self, action):
        """do something! What you do is determined by the argument:
           1   Output A. Copy B to A. Get the next B.
           2   Copy B to A. Get the next B. (Delete A).
           3   Get the next B. (Delete B).
           action treats a string as a single character. Wow!
           action recognizes a regular expression if it is preceded by ( or , or =.
        """
        if action <= 1:
            self._outA()

        if action <= 2:
            self.theA = self.theB
            if self.theA == "'" or self.theA == '"':
                # Copy a string literal through verbatim.
                while 1:
                    self._outA()
                    self.theA = self._get()
                    if self.theA == self.theB:
                        break
                    if self.theA <= '\n':
                        raise UnterminatedStringLiteral()
                    if self.theA == '\\':
                        # Keep escape sequences (\" , \' , ...) intact.
                        self._outA()
                        self.theA = self._get()

        if action <= 3:
            self.theB = self._next()
            if self.theB == '/' and (self.theA == '(' or self.theA == ',' or
                                     self.theA == '=' or self.theA == ':' or
                                     self.theA == '[' or self.theA == '?' or
                                     self.theA == '!' or self.theA == '&' or
                                     self.theA == '|'):
                # A '/' after one of these characters starts a regular
                # expression literal, not division: copy it through verbatim.
                self._outA()
                self._outB()
                while 1:
                    self.theA = self._get()
                    if self.theA == '/':
                        break
                    elif self.theA == '\\':
                        self._outA()
                        self.theA = self._get()
                    elif self.theA <= '\n':
                        raise UnterminatedRegularExpression()
                self._outA()
                self.theB = self._next()

    def _jsmin(self):
        """Copy the input to the output, deleting the characters which are
        insignificant to JavaScript. Comments will be removed. Tabs will be
        replaced with spaces. Carriage returns will be replaced with linefeeds.
        Most spaces and linefeeds will be removed.
        """
        self.theA = '\n'
        self._action(3)

        while self.theA != '\000':
            if self.theA == ' ':
                # A space is kept only between identifier-like characters.
                if isAlphanum(self.theB):
                    self._action(1)
                else:
                    self._action(2)
            elif self.theA == '\n':
                if self.theB in ['{', '[', '(', '+', '-']:
                    self._action(1)
                elif self.theB == ' ':
                    self._action(3)
                else:
                    if isAlphanum(self.theB):
                        self._action(1)
                    else:
                        self._action(2)
            else:
                if self.theB == ' ':
                    if isAlphanum(self.theA):
                        self._action(1)
                    else:
                        self._action(3)
                elif self.theB == '\n':
                    if self.theA in ['}', ']', ')', '+', '-', '"', '\'']:
                        self._action(1)
                    else:
                        if isAlphanum(self.theA):
                            self._action(1)
                        else:
                            self._action(3)
                else:
                    self._action(1)

    def minify(self, instream, outstream):
        """Minify `instream` into `outstream`; closes `instream` when done."""
        self.instream = instream
        self.outstream = outstream
        self.theA = None
        # BUG FIX: this line read `self.thaB = None` -- a typo that made it
        # a dead assignment (harmless only because _jsmin() assigns theB
        # via _action(3) before ever reading it).
        self.theB = None
        self.theLookahead = None

        self._jsmin()
        self.instream.close()
if __name__ == '__main__':
    # When run as a script, act as a stdin-to-stdout minifying filter.
    import sys
    JavascriptMinify().minify(sys.stdin, sys.stdout)
pacotes/openlayers/tools/mergejs.py
... | ... | @@ -1,252 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | -# | |
3 | -# Merge multiple JavaScript source code files into one. | |
4 | -# | |
5 | -# Usage: | |
6 | -# This script requires source files to have dependencies specified in them. | |
7 | -# | |
8 | -# Dependencies are specified with a comment of the form: | |
9 | -# | |
10 | -# // @requires <file path> | |
11 | -# | |
12 | -# e.g. | |
13 | -# | |
14 | -# // @requires Geo/DataSource.js | |
15 | -# | |
16 | -# This script should be executed like so: | |
17 | -# | |
18 | -# mergejs.py <output.js> <directory> [...] | |
19 | -# | |
20 | -# e.g. | |
21 | -# | |
22 | -# mergejs.py openlayers.js Geo/ CrossBrowser/ | |
23 | -# | |
24 | -# This example will cause the script to walk the `Geo` and | |
25 | -# `CrossBrowser` directories--and subdirectories thereof--and import | |
26 | -# all `*.js` files encountered. The dependency declarations will be extracted | |
27 | -# and then the source code from imported files will be output to | |
28 | -# a file named `openlayers.js` in an order which fulfils the dependencies | |
29 | -# specified. | |
30 | -# | |
31 | -# | |
32 | -# Note: This is a very rough initial version of this code. | |
33 | -# | |
34 | -# -- Copyright 2005-2008 MetaCarta, Inc. / OpenLayers project -- | |
35 | -# | |
36 | - | |
37 | -# TODO: Allow files to be excluded. e.g. `Crossbrowser/DebugMode.js`? | |
38 | -# TODO: Report error when dependency can not be found rather than KeyError. | |
39 | - | |
40 | -import re | |
41 | -import os | |
42 | -import sys | |
43 | - | |
44 | -SUFFIX_JAVASCRIPT = ".js" | |
45 | - | |
46 | -RE_REQUIRE = "@requires:? (.*)\n" # TODO: Ensure in comment? | |
47 | -class SourceFile: | |
48 | - """ | |
49 | - Represents a Javascript source code file. | |
50 | - """ | |
51 | - | |
52 | - def __init__(self, filepath, source): | |
53 | - """ | |
54 | - """ | |
55 | - self.filepath = filepath | |
56 | - self.source = source | |
57 | - | |
58 | - self.requiredBy = [] | |
59 | - | |
60 | - | |
61 | - def _getRequirements(self): | |
62 | - """ | |
63 | - Extracts the dependencies specified in the source code and returns | |
64 | - a list of them. | |
65 | - """ | |
66 | - # TODO: Cache? | |
67 | - return re.findall(RE_REQUIRE, self.source) | |
68 | - | |
69 | - requires = property(fget=_getRequirements, doc="") | |
70 | - | |
71 | - | |
72 | - | |
def usage(filename):
    """
    Displays a usage message.

    `filename` is the program name to show (typically sys.argv[0]).
    """
    # Parenthesized single-argument print works identically under
    # Python 2 and Python 3.
    print("%s [-c <config file>] <output.js> <directory> [...]" % filename)

79 | - | |
class Config:
    """
    Represents a parsed configuration file.

    A configuration file should be of the following form:

        [first]
        3rd/prototype.js
        core/application.js
        core/params.js
        # A comment

        [last]
        core/api.js # Another comment

        [include]
        core/only_this.js

        [exclude]
        3rd/logger.js

    All four section headings are required, in that order.

    The files listed in the `first` section will be forced to load
    *before* all other files (in the order listed). The files in `last`
    section will be forced to load *after* all the other files (in the
    order listed).

    The files list in the `exclude` section will not be imported.

    Any text appearing after a # symbol indicates a comment.

    """

    def __init__(self, filename):
        """
        Parses the content of the named file and stores the values.
        """
        # Keep only meaningful lines: drop blanks and full-line comments,
        # then chop trailing "# ..." comments off whatever remains.
        kept = []
        for raw in open(filename):
            bare = raw.strip()
            if bare and not bare.startswith("#"):
                kept.append(re.sub("#.*?$", "", raw).strip())

        firstAt = kept.index("[first]")
        lastAt = kept.index("[last]")
        includeAt = kept.index("[include]")
        excludeAt = kept.index("[exclude]")

        self.forceFirst = kept[firstAt + 1:lastAt]
        self.forceLast = kept[lastAt + 1:includeAt]
        self.include = kept[includeAt + 1:excludeAt]
        self.exclude = kept[excludeAt + 1:]
124 | - | |
def run (sourceDirectory, outputFilename = None, configFile = None):
    """Merge the JavaScript files found under `sourceDirectory`.

    Walks the tree for *.js files, orders them so that every `@requires`
    dependency precedes its dependent (via toposort), and concatenates
    them, each preceded by a banner naming the file.

    sourceDirectory -- root directory to scan
    outputFilename  -- when given, the merged text is also written here
    configFile      -- optional Config file controlling first/last
                       ordering and include/exclude filtering
    Returns the merged source as a single string.
    """
    cfg = None
    if configFile:
        cfg = Config(configFile)

    allFiles = []

    ## Find all the Javascript source files
    for root, dirs, files in os.walk(sourceDirectory):
        for filename in files:
            if filename.endswith(SUFFIX_JAVASCRIPT) and not filename.startswith("."):
                # Path relative to sourceDirectory, normalized to "/".
                filepath = os.path.join(root, filename)[len(sourceDirectory)+1:]
                filepath = filepath.replace("\\", "/")
                if cfg and cfg.include:
                    # Whitelist mode: only [include] and [first] files.
                    if filepath in cfg.include or filepath in cfg.forceFirst:
                        allFiles.append(filepath)
                elif (not cfg) or (filepath not in cfg.exclude):
                    allFiles.append(filepath)

    ## Header inserted at the start of each file in the output
    HEADER = "/* " + "=" * 70 + "\n %s\n" + " " + "=" * 70 + " */\n\n"

    files = {}

    order = [] # List of filepaths to output, in a dependency satisfying order

    ## Import file source code
    ## TODO: Do import when we walk the directories above?
    for filepath in allFiles:
        print "Importing: %s" % filepath
        fullpath = os.path.join(sourceDirectory, filepath).strip()
        content = open(fullpath, "U").read() # TODO: Ensure end of line @ EOF?
        files[filepath] = SourceFile(filepath, content) # TODO: Chop path?

    from toposort import toposort

    complete = False
    resolution_pass = 1

    # Fixed-point loop: sorting can pull in @requires targets that were
    # not on disk-walk's list (e.g. excluded by [include]); each pass
    # imports the newcomers and re-sorts until the order is consistent.
    while not complete:
        order = [] # List of filepaths to output, in a dependency satisfying order
        nodes = []
        routes = []
        ## Resolve the dependencies
        print "Resolution pass %s... " % resolution_pass
        resolution_pass += 1

        # Each @requires becomes a (needed, needer) edge for toposort.
        for filepath, info in files.items():
            nodes.append(filepath)
            for neededFilePath in info.requires:
                routes.append((neededFilePath, filepath))

        for dependencyLevel in toposort(nodes, routes):
            for filepath in dependencyLevel:
                order.append(filepath)
                # NOTE(review): has_key() is Python-2-only syntax.
                if not files.has_key(filepath):
                    # A dependency we have not imported yet; pull it in.
                    print "Importing: %s" % filepath
                    fullpath = os.path.join(sourceDirectory, filepath).strip()
                    content = open(fullpath, "U").read() # TODO: Ensure end of line @ EOF?
                    files[filepath] = SourceFile(filepath, content) # TODO: Chop path?

        # Double check all dependencies have been met
        complete = True
        try:
            for fp in order:
                if max([order.index(rfp) for rfp in files[fp].requires] +
                       [order.index(fp)]) != order.index(fp):
                    complete = False
        except:
            # NOTE(review): bare except treats any lookup failure (e.g. a
            # dependency missing from `order`) as "not complete yet".
            complete = False

    ## Move forced first and last files to the required position
    if cfg:
        print "Re-ordering files..."
        order = cfg.forceFirst + [item
                     for item in order
                     if ((item not in cfg.forceFirst) and
                         (item not in cfg.forceLast))] + cfg.forceLast

    ## Output the files in the determined order
    result = []

    for fp in order:
        f = files[fp]
        print "Exporting: ", f.filepath
        result.append(HEADER % f.filepath)
        source = f.source
        result.append(source)
        if not source.endswith("\n"):
            result.append("\n")

    print "\nTotal files merged: %d " % len(files)

    if outputFilename:
        print "\nGenerating: %s" % (outputFilename)
        open(outputFilename, "w").write("".join(result))
    return "".join(result)
230 | - | |
231 | -if __name__ == "__main__": | |
232 | - import getopt | |
233 | - | |
234 | - options, args = getopt.getopt(sys.argv[1:], "-c:") | |
235 | - | |
236 | - try: | |
237 | - outputFilename = args[0] | |
238 | - except IndexError: | |
239 | - usage(sys.argv[0]) | |
240 | - raise SystemExit | |
241 | - else: | |
242 | - sourceDirectory = args[1] | |
243 | - if not sourceDirectory: | |
244 | - usage(sys.argv[0]) | |
245 | - raise SystemExit | |
246 | - | |
247 | - configFile = None | |
248 | - if options and options[0][0] == "-c": | |
249 | - configFile = options[0][1] | |
250 | - print "Parsing configuration file: %s" % filename | |
251 | - | |
252 | - run( sourceDirectory, outputFilename, configFile ) |
pacotes/openlayers/tools/minimize.py
... | ... | @@ -1,47 +0,0 @@ |
1 | -# Minimal Python Minimizer | |
2 | -# Copyright 2008, Christopher Schmidt | |
3 | -# Released under the MIT License | |
4 | -# | |
5 | -# Taken from: http://svn.crschmidt.net/personal/python/minimize.py | |
6 | -# $Id: minimize.py 6 2008-01-03 06:33:35Z crschmidt $ | |
7 | -# | |
8 | -# Permission is hereby granted, free of charge, to any person obtaining a copy | |
9 | -# of this software and associated documentation files (the "Software"), to deal | |
10 | -# in the Software without restriction, including without limitation the rights | |
11 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
12 | -# copies of the Software, and to permit persons to whom the Software is | |
13 | -# furnished to do so, subject to the following conditions: | |
14 | -# | |
15 | -# The above copyright notice and this permission notice shall be included in | |
16 | -# all copies or substantial portions of the Software. | |
17 | -# | |
18 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
21 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
23 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
24 | -# THE SOFTWARE. | |
25 | - | |
26 | -import re | |
27 | - | |
def strip_comments_helper(data):
    """remove all /* */ format comments and surrounding whitespace."""
    comment = re.compile(r'[\s]*/\*.*?\*/[\s]*', re.DOTALL)
    return comment.sub('', data)
32 | - | |
def minimize(data, exclude=None):
    """Central function call. This will call all other compression
    functions. To add further compression algorithms, simply add
    functions whose names end in _helper which take a string as input
    and return a more compressed string as output.

    `exclude` may list helper names (without the "_helper" suffix)
    to skip.
    """
    # .items() instead of the Python-2-only .iteritems(), so this runs
    # under both Python 2 and Python 3.
    for key, item in globals().items():
        if key.endswith("_helper"):
            func_key = key[:-7]  # helper name minus the "_helper" suffix
            if not exclude or func_key not in exclude:
                data = item(data)
    return data
44 | - | |
45 | -if __name__ == "__main__": | |
46 | - import sys | |
47 | - print minimize(open(sys.argv[1]).read()) |
pacotes/openlayers/tools/oldot.py
... | ... | @@ -1,43 +0,0 @@ |
1 | -import re | |
2 | -import os | |
def run():
    """Scan ../lib/OpenLayers and return [class name, [parent names]] pairs.

    For every *.js file, the argument list of its OpenLayers.Class(...)
    call (everything before the final "{") is taken as the parent classes.
    Returns [] when the source directory does not exist.
    """
    sourceDirectory = "../lib/OpenLayers"
    allFiles = []
    SUFFIX_JAVASCRIPT = ".js"
    ## Find all the Javascript source files
    for root, dirs, files in os.walk(sourceDirectory):
        for filename in files:
            if filename.endswith(SUFFIX_JAVASCRIPT) and not filename.startswith("."):
                filepath = os.path.join(root, filename)[len(sourceDirectory)+1:]
                filepath = filepath.replace("\\", "/")
                data = open(os.path.join(sourceDirectory, filepath)).read()
                parents = re.search("OpenLayers.Class\((.*?){", data,
                                    re.DOTALL)
                if parents:
                    parents = [x.strip() for x in parents.group(1).strip().strip(",").split(",")]
                else:
                    parents = []
                # BUG FIX: was filepath.strip(".js"), which strips any
                # leading/trailing '.', 'j', 's' *characters* (so
                # "Canvas.js" became "Canva"); slice off the extension.
                cls = "OpenLayers.%s" % filepath[:-len(SUFFIX_JAVASCRIPT)].replace("/", ".")
                allFiles.append([cls, parents])
    return allFiles
23 | -print """ | |
24 | -digraph name { | |
25 | - fontname = "Helvetica" | |
26 | - fontsize = 8 | |
27 | - K = 0.6 | |
28 | - | |
29 | - node [ | |
30 | - fontname = "Helvetica" | |
31 | - fontsize = 8 | |
32 | - shape = "plaintext" | |
33 | - ] | |
34 | -""" | |
35 | - | |
36 | -for i in run(): | |
37 | - print i[0].replace(".", "_") | |
38 | - for item in i[1]: | |
39 | - if not item: continue | |
40 | - print "%s -> %s" % (i[0].replace(".","_"), item.replace(".", "_")) | |
41 | - print "; " | |
42 | - | |
43 | -print """}""" |
pacotes/openlayers/tools/release.sh
... | ... | @@ -1,29 +0,0 @@ |
#!/bin/sh

# Build and publish an OpenLayers release for the version given as $1:
# export the tagged source, build the single-file OpenLayers.js, publish
# it under /www/openlayers/htdocs/api/<version>, generate NaturalDocs
# documentation, then pack and publish .tar.gz/.zip archives.
#
# Usage: release.sh <version>

VERSION=$1

# Export a pristine copy of the release tag (no .svn metadata).
svn export http://svn.openlayers.org/tags/openlayers/release-$VERSION OpenLayers-$VERSION
cd OpenLayers-$VERSION/build
./build.py full
cp OpenLayers.js ..

cd ..

# Output directories for the NaturalDocs runs below.
mkdir doc/devdocs
mkdir doc/apidocs
# Drop compiled Python files so they do not end up in the archives.
rm tools/*.pyc

# Publish the built library plus its static resources on the website.
mkdir /www/openlayers/htdocs/api/$VERSION
cp OpenLayers.js /www/openlayers/htdocs/api/$VERSION
cp -a img/ /www/openlayers/htdocs/api/$VERSION
cp -a theme/ /www/openlayers/htdocs/api/$VERSION

cd ..

# Generate full developer docs and API-only docs with NaturalDocs.
~/nd/NaturalDocs -i OpenLayers-$VERSION/lib -o HTML OpenLayers-$VERSION/doc/devdocs -p OpenLayers-$VERSION/doc_config -s Small OL
~/nd/NaturalDocs -i OpenLayers-$VERSION/lib -o HTML OpenLayers-$VERSION/doc/apidocs -p OpenLayers-$VERSION/apidoc_config -s Small OL

# Package the release and publish the archives for download.
tar cvfz OpenLayers-$VERSION.tar.gz OpenLayers-$VERSION/
zip -9r OpenLayers-$VERSION.zip OpenLayers-$VERSION/

cp OpenLayers-$VERSION.* /www/openlayers/htdocs/download
pacotes/openlayers/tools/shrinksafe.py
... | ... | @@ -1,54 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | -# | |
3 | -# Script to provide a wrapper around the ShrinkSafe "web service" | |
4 | -# <http://shrinksafe.dojotoolkit.org/> | |
5 | -# | |
6 | - | |
7 | -# | |
8 | -# We use this script for two reasons: | |
9 | -# | |
10 | -# * This avoids having to install and configure Java and the standalone | |
11 | -# ShrinkSafe utility. | |
12 | -# | |
13 | -# * The current ShrinkSafe standalone utility was broken when we last | |
14 | -# used it. | |
15 | -# | |
16 | - | |
17 | -import sys | |
18 | - | |
19 | -import urllib | |
20 | -import urllib2 | |
21 | - | |
22 | -URL_SHRINK_SAFE = "http://shrinksafe.dojotoolkit.org/shrinksafe.php" | |
23 | - | |
24 | -# This would normally be dynamically generated: | |
25 | -BOUNDARY_MARKER = "---------------------------72288400411964641492083565382" | |
26 | - | |
27 | -if __name__ == "__main__": | |
28 | - ## Grab the source code | |
29 | - try: | |
30 | - sourceFilename = sys.argv[1] | |
31 | - except: | |
32 | - print "Usage: %s (<source filename>|-)" % sys.argv[0] | |
33 | - raise SystemExit | |
34 | - | |
35 | - if sourceFilename == "-": | |
36 | - sourceCode = sys.stdin.read() | |
37 | - sourceFilename = "stdin.js" | |
38 | - else: | |
39 | - sourceCode = open(sourceFilename).read() | |
40 | - | |
41 | - ## Create the request replicating posting of the form from the web page | |
42 | - request = urllib2.Request(url=URL_SHRINK_SAFE) | |
43 | - request.add_header("Content-Type", | |
44 | - "multipart/form-data; boundary=%s" % BOUNDARY_MARKER) | |
45 | - request.add_data(""" | |
46 | ---%s | |
47 | -Content-Disposition: form-data; name="shrinkfile[]"; filename="%s" | |
48 | -Content-Type: application/x-javascript | |
49 | - | |
50 | -%s | |
51 | -""" % (BOUNDARY_MARKER, sourceFilename, sourceCode)) | |
52 | - | |
53 | - ## Deliver the result | |
54 | - print urllib2.urlopen(request).read(), |
pacotes/openlayers/tools/toposort.py
... | ... | @@ -1,260 +0,0 @@ |
1 | -# | |
2 | -# According to <http://www.vrplumber.com/programming/> this file | |
3 | -# is licensed under a BSD-style license. We only use the section | |
4 | -# originally by Tim Peters. | |
5 | -# | |
6 | -# TODO: The use of this code needs to be okayed by someone. | |
7 | -# | |
8 | - | |
9 | -class RecursionError( OverflowError, ValueError ): | |
10 | - '''Unable to calculate result because of recursive structure''' | |
11 | - | |
12 | - | |
13 | -def sort(nodes, routes, noRecursion=1): | |
14 | - '''Passed a list of node IDs and a list of source,dest ID routes | |
15 | - attempt to create a list of stages where each sub list | |
16 | - is one stage in a process. | |
17 | - ''' | |
18 | - children, parents = _buildChildrenLists(routes) | |
19 | - # first stage is those nodes | |
20 | - # having no incoming routes... | |
21 | - stage = [] | |
22 | - stages = [stage] | |
23 | - taken = [] | |
24 | - for node in nodes: | |
25 | - if (not parents.get(node)): | |
26 | - stage.append (node) | |
27 | - if nodes and not stage: | |
28 | - # there is no element which does not depend on | |
29 | - # some other element!!! | |
30 | - stage.append( nodes[0]) | |
31 | - taken.extend( stage ) | |
32 | - nodes = filter ( lambda x, l=stage: x not in l, nodes ) | |
33 | - while nodes: | |
34 | - previousStageChildren = [] | |
35 | - nodelen = len(nodes) | |
36 | - # second stage are those nodes | |
37 | - # which are direct children of the first stage | |
38 | - for node in stage: | |
39 | - for child in children.get (node, []): | |
40 | - if child not in previousStageChildren and child not in taken: | |
41 | - previousStageChildren.append(child) | |
42 | - elif child in taken and noRecursion: | |
43 | - raise RecursionError( (child, node) ) | |
44 | - # unless they are children of other direct children... | |
45 | - # TODO, actually do that... | |
46 | - stage = previousStageChildren | |
47 | - removes = [] | |
48 | - for current in stage: | |
49 | - currentParents = parents.get( current, [] ) | |
50 | - for parent in currentParents: | |
51 | - if parent in stage and parent != current: | |
52 | - # might wind up removing current... | |
53 | - if not current in parents.get(parent, []): | |
54 | - # is not mutually dependent... | |
55 | - removes.append( current ) | |
56 | - for remove in removes: | |
57 | - while remove in stage: | |
58 | - stage.remove( remove ) | |
59 | - stages.append( stage) | |
60 | - taken.extend( stage ) | |
61 | - nodes = filter ( lambda x, l=stage: x not in l, nodes ) | |
62 | - if nodelen == len(nodes): | |
63 | - if noRecursion: | |
64 | - raise RecursionError( nodes ) | |
65 | - else: | |
66 | - stages.append( nodes[:] ) | |
67 | - nodes = [] | |
68 | - return stages | |
69 | - | |
70 | -def _buildChildrenLists (routes): | |
71 | - childrenTable = {} | |
72 | - parentTable = {} | |
73 | - for sourceID,destinationID in routes: | |
74 | - currentChildren = childrenTable.get( sourceID, []) | |
75 | - currentParents = parentTable.get( destinationID, []) | |
76 | - if not destinationID in currentChildren: | |
77 | - currentChildren.append ( destinationID) | |
78 | - if not sourceID in currentParents: | |
79 | - currentParents.append ( sourceID) | |
80 | - childrenTable[sourceID] = currentChildren | |
81 | - parentTable[destinationID] = currentParents | |
82 | - return childrenTable, parentTable | |
83 | - | |
84 | - | |
def toposort (nodes, routes, noRecursion=1):
    '''Topological sort from Tim Peters, fairly efficient
    in comparison (it seems).

    nodes  -- sequence of node IDs
    routes -- sequence of (depended-on, dependent) pairs
    Returns a list of "generations": each generation lists nodes whose
    dependencies are all met by earlier generations.  Raises
    RecursionError on a cycle when noRecursion is true; otherwise the
    cycle is broken arbitrarily.
    '''
    # dependencies maps node -> (count of unmet dependencies, node);
    # inversedependencies maps node -> nodes that depend on it.
    dependencies = {}
    inversedependencies = {}
    if not nodes:
        return []
    if not routes:
        # Nothing depends on anything: one generation with every node.
        return [nodes]
    for node in nodes:
        dependencies[ node ] = (0, node)
        inversedependencies[ node ] = []

    for depended, depends in routes:
        # is it a null rule
        try:
            newdependencylevel, entry = dependencies.get ( depends, (0, depends))
        except TypeError:
            print(depends)
            raise
        dependencies[ depends ] = (newdependencylevel + 1, depends)
        # "dependency (existence) of depended-on"
        newdependencylevel, entry = dependencies.get ( depended, (0, depended) )
        dependencies[ depended ] = (newdependencylevel, depended)
        # Inverse dependency set up
        dependencieslist = inversedependencies.get ( depended, [])
        dependencieslist.append (depends)
        inversedependencies[depended] = dependencieslist
    ### Now we do the actual sorting
    # The first task is to create the sortable
    # list of dependency-levels.  list(...) because Python 3's
    # dict.values() is a view that supports neither sort() nor del.
    sortinglist = list(dependencies.values())
    sortinglist.sort ()
    output = []
    while sortinglist:
        deletelist = []
        generation = []
        output.append( generation)
        # Every entry with a zero unmet-dependency count is emitted now.
        while sortinglist and sortinglist[0][0] == 0:
            number, entry = sortinglist[0]
            generation.append ( entry )
            deletelist.append( entry )
            for inverse in inversedependencies.get(entry, () ):
                try:
                    oldcount, inverse = dependencies [ inverse]
                    if oldcount > 0:
                        # will be dealt with on later pass
                        dependencies [ inverse] = (oldcount-1, inverse)
                    else:
                        # will be dealt with on this pass,
                        # so needs not to be in the sorting list next time
                        deletelist.append( inverse )
                        # just in case a loop comes through
                        inversedependencies[entry] = []
                except KeyError:
                    # dealing with a recursion-breaking run...
                    pass
            del sortinglist [0]
        # if no elements could be deleted, then
        # there is something which depends upon itself
        if not deletelist:
            if noRecursion:
                raise RecursionError( sortinglist )
            else:
                # hack so that something gets deleted...
                dependencies[sortinglist[0][1]] = (0,sortinglist[0][1])
        # delete the items that were dealt with
        for item in deletelist:
            try:
                del dependencies [ item ]
            except KeyError:
                pass
        # need to recreate the sortinglist
        sortinglist = list(dependencies.values())
        if not generation:
            output.remove( generation )
        sortinglist.sort ()
    return output
169 | - | |
170 | - | |
171 | - | |
172 | - | |
173 | -if __name__ == "__main__": | |
174 | - | |
175 | - nodes = ['a', 'b', 'c', 'd', 'e', 'f'] | |
176 | - route = [('a', 'b'), ('b', 'c'), ('b', 'd'), ('e','f')] | |
177 | - | |
178 | - for x in toposort( nodes, route): | |
179 | - for a in x: | |
180 | - print a | |
181 | - | |
182 | - raise SystemExit | |
183 | - | |
184 | - | |
185 | - | |
186 | - import pprint, traceback | |
187 | - nodes= [ 0,1,2,3,4,5 ] | |
188 | - testingValues = [ | |
189 | - [ (0,1),(1,2),(2,3),(3,4),(4,5)], | |
190 | - [ (0,1),(0,2),(1,2),(3,4),(4,5)], | |
191 | - [ | |
192 | - (0,1), | |
193 | - (0,2), | |
194 | - (0,2), | |
195 | - (2,4), | |
196 | - (2,5), | |
197 | - (3,2), | |
198 | - (0,3)], | |
199 | - [ | |
200 | - (0,1), # 3-element cycle test, no orphan nodes | |
201 | - (1,2), | |
202 | - (2,0), | |
203 | - (2,4), | |
204 | - (2,5), | |
205 | - (3,2), | |
206 | - (0,3)], | |
207 | - [ | |
208 | - (0,1), | |
209 | - (1,1), | |
210 | - (1,1), | |
211 | - (1,4), | |
212 | - (1,5), | |
213 | - (1,2), | |
214 | - (3,1), | |
215 | - (2,1), | |
216 | - (2,0)], | |
217 | - [ | |
218 | - (0,1), | |
219 | - (1,0), | |
220 | - (0,2), | |
221 | - (0,3), | |
222 | - ], | |
223 | - [ | |
224 | - (0,1), | |
225 | - (1,0), | |
226 | - (0,2), | |
227 | - (3,1), | |
228 | - ], | |
229 | - ] | |
230 | - print 'sort, no recursion allowed' | |
231 | - for index in range(len(testingValues)): | |
232 | -## print ' %s -- %s'%( index, testingValues[index]) | |
233 | - try: | |
234 | - print ' ', sort( nodes, testingValues[index] ) | |
235 | - except: | |
236 | - print 'exception raised' | |
237 | - print 'toposort, no recursion allowed' | |
238 | - for index in range(len(testingValues)): | |
239 | -## print ' %s -- %s'%( index, testingValues[index]) | |
240 | - try: | |
241 | - print ' ', toposort( nodes, testingValues[index] ) | |
242 | - except: | |
243 | - print 'exception raised' | |
244 | - print 'sort, recursion allowed' | |
245 | - for index in range(len(testingValues)): | |
246 | -## print ' %s -- %s'%( index, testingValues[index]) | |
247 | - try: | |
248 | - print ' ', sort( nodes, testingValues[index],0 ) | |
249 | - except: | |
250 | - print 'exception raised' | |
251 | - print 'toposort, recursion allowed' | |
252 | - for index in range(len(testingValues)): | |
253 | -## print ' %s -- %s'%( index, testingValues[index]) | |
254 | - try: | |
255 | - print ' ', toposort( nodes, testingValues[index],0 ) | |
256 | - except: | |
257 | - print 'exception raised' | |
258 | - | |
259 | - | |
260 | - |
pacotes/openlayers/tools/update_dev_dir.sh
... | ... | @@ -1,45 +0,0 @@ |
1 | -#!/bin/sh | |
2 | - | |
3 | -# Used to update http://openlayers.org/dev/ | |
4 | - | |
5 | -svn up /www/openlayers/docs/dev; | |
6 | - | |
7 | -# Get current 'Last Changed Rev' | |
8 | -REV=`svn info /www/openlayers/docs/dev/ | grep 'Last Changed Rev' | awk '{print $4}'` | |
9 | - | |
10 | -# Get the last svn rev | |
11 | -touch /tmp/ol_svn_rev | |
12 | -OLD_REV="o`cat /tmp/ol_svn_rev`" | |
13 | - | |
14 | -# If they're not equal, do some work. | |
15 | -if [ ! o$REV = $OLD_REV ]; then | |
16 | - | |
17 | - cd /www/openlayers/docs/dev/tools/ | |
18 | - python exampleparser.py | |
19 | - cd /www/openlayers/docs/dev/build | |
20 | - ./build.py | |
21 | - | |
22 | - cp OpenLayers.js .. | |
23 | - cd .. | |
24 | - | |
25 | - sed -i -e 's!../lib/OpenLayers.js!../OpenLayers.js!' examples/*.html | |
26 | - perl /home/crschmidt/NaturalDocs -i /www/openlayers/docs/dev/lib -o HTML /www/openlayers/dev/apidocs -p /www/openlayers/docs/dev/apidoc_config -s Default OL >/dev/null | |
27 | - perl /home/crschmidt/NaturalDocs -i /www/openlayers/docs/dev/lib -o HTML /www/openlayers/dev/docs -p /www/openlayers/docs/dev/doc_config -s Default OL >/dev/null | |
28 | - | |
29 | - # Record the revision | |
30 | - echo -n $REV > /tmp/ol_svn_rev | |
31 | -fi | |
32 | - | |
33 | -svn up /www/openlayers/documentation-checkout | |
34 | -REV=`svn info /www/openlayers/documentation-checkout | grep 'Last Changed Rev' | awk '{print $4}'` | |
35 | -# Get the last svn rev | |
36 | -touch /tmp/ol_doc_rev | |
37 | -OLD_REV="o`cat /tmp/ol_doc_rev`" | |
38 | -# If they're not equal, do some work. | |
39 | -if [ ! o$REV = $OLD_REV ]; then | |
40 | - cd /www/openlayers/documentation-checkout | |
41 | - make html > /dev/null | |
42 | - cp -r _build/html/* /www/openlayers/documentation | |
43 | - | |
44 | - echo -n $REV > /tmp/ol_doc_rev | |
45 | -fi |