Commit f97f4db2002913e1a18d17d7988ff27e00b7b9c1
1 parent
4c12b068
Exists in
master
and in
7 other branches
--no commit message
Showing
12 changed files
with
0 additions
and
3250 deletions
Show diff stats
pacotes/openlayers/tools/BeautifulSoup.py
| ... | ... | @@ -1,1767 +0,0 @@ |
| 1 | -"""Beautiful Soup | |
| 2 | -Elixir and Tonic | |
| 3 | -"The Screen-Scraper's Friend" | |
| 4 | -http://www.crummy.com/software/BeautifulSoup/ | |
| 5 | - | |
| 6 | -Beautiful Soup parses a (possibly invalid) XML or HTML document into a | |
| 7 | -tree representation. It provides methods and Pythonic idioms that make | |
| 8 | -it easy to navigate, search, and modify the tree. | |
| 9 | - | |
| 10 | -A well-formed XML/HTML document yields a well-formed data | |
| 11 | -structure. An ill-formed XML/HTML document yields a correspondingly | |
| 12 | -ill-formed data structure. If your document is only locally | |
| 13 | -well-formed, you can use this library to find and process the | |
| 14 | -well-formed part of it. The BeautifulSoup class | |
| 15 | - | |
| 16 | -Beautiful Soup works with Python 2.2 and up. It has no external | |
| 17 | -dependencies, but you'll have more success at converting data to UTF-8 | |
| 18 | -if you also install these three packages: | |
| 19 | - | |
| 20 | -* chardet, for auto-detecting character encodings | |
| 21 | - http://chardet.feedparser.org/ | |
| 22 | -* cjkcodecs and iconv_codec, which add more encodings to the ones supported | |
| 23 | - by stock Python. | |
| 24 | - http://cjkpython.i18n.org/ | |
| 25 | - | |
| 26 | -Beautiful Soup defines classes for two main parsing strategies: | |
| 27 | - | |
| 28 | - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific | |
| 29 | - language that kind of looks like XML. | |
| 30 | - | |
| 31 | - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid | |
| 32 | - or invalid. This class has web browser-like heuristics for | |
| 33 | - obtaining a sensible parse tree in the face of common HTML errors. | |
| 34 | - | |
| 35 | -Beautiful Soup also defines a class (UnicodeDammit) for autodetecting | |
| 36 | -the encoding of an HTML or XML document, and converting it to | |
| 37 | -Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. | |
| 38 | - | |
| 39 | -For more than you ever wanted to know about Beautiful Soup, see the | |
| 40 | -documentation: | |
| 41 | -http://www.crummy.com/software/BeautifulSoup/documentation.html | |
| 42 | - | |
| 43 | -""" | |
| 44 | -from __future__ import generators | |
| 45 | - | |
| 46 | -__author__ = "Leonard Richardson (leonardr@segfault.org)" | |
| 47 | -__version__ = "3.0.4" | |
| 48 | -__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" | |
| 49 | -__license__ = "PSF" | |
| 50 | - | |
| 51 | -from sgmllib import SGMLParser, SGMLParseError | |
| 52 | -import codecs | |
| 53 | -import types | |
| 54 | -import re | |
| 55 | -import sgmllib | |
| 56 | -try: | |
| 57 | - from htmlentitydefs import name2codepoint | |
| 58 | -except ImportError: | |
| 59 | - name2codepoint = {} | |
| 60 | - | |
| 61 | -#This hack makes Beautiful Soup able to parse XML with namespaces | |
| 62 | -sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') | |
| 63 | - | |
| 64 | -DEFAULT_OUTPUT_ENCODING = "utf-8" | |
| 65 | - | |
| 66 | -# First, the classes that represent markup elements. | |
| 67 | - | |
| 68 | -class PageElement: | |
| 69 | - """Contains the navigational information for some part of the page | |
| 70 | - (either a tag or a piece of text)""" | |
| 71 | - | |
| 72 | - def setup(self, parent=None, previous=None): | |
| 73 | - """Sets up the initial relations between this element and | |
| 74 | - other elements.""" | |
| 75 | - self.parent = parent | |
| 76 | - self.previous = previous | |
| 77 | - self.next = None | |
| 78 | - self.previousSibling = None | |
| 79 | - self.nextSibling = None | |
| 80 | - if self.parent and self.parent.contents: | |
| 81 | - self.previousSibling = self.parent.contents[-1] | |
| 82 | - self.previousSibling.nextSibling = self | |
| 83 | - | |
| 84 | - def replaceWith(self, replaceWith): | |
| 85 | - oldParent = self.parent | |
| 86 | - myIndex = self.parent.contents.index(self) | |
| 87 | - if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: | |
| 88 | - # We're replacing this element with one of its siblings. | |
| 89 | - index = self.parent.contents.index(replaceWith) | |
| 90 | - if index and index < myIndex: | |
| 91 | - # Furthermore, it comes before this element. That | |
| 92 | - # means that when we extract it, the index of this | |
| 93 | - # element will change. | |
| 94 | - myIndex = myIndex - 1 | |
| 95 | - self.extract() | |
| 96 | - oldParent.insert(myIndex, replaceWith) | |
| 97 | - | |
| 98 | - def extract(self): | |
| 99 | - """Destructively rips this element out of the tree.""" | |
| 100 | - if self.parent: | |
| 101 | - try: | |
| 102 | - self.parent.contents.remove(self) | |
| 103 | - except ValueError: | |
| 104 | - pass | |
| 105 | - | |
| 106 | - #Find the two elements that would be next to each other if | |
| 107 | - #this element (and any children) hadn't been parsed. Connect | |
| 108 | - #the two. | |
| 109 | - lastChild = self._lastRecursiveChild() | |
| 110 | - nextElement = lastChild.next | |
| 111 | - | |
| 112 | - if self.previous: | |
| 113 | - self.previous.next = nextElement | |
| 114 | - if nextElement: | |
| 115 | - nextElement.previous = self.previous | |
| 116 | - self.previous = None | |
| 117 | - lastChild.next = None | |
| 118 | - | |
| 119 | - self.parent = None | |
| 120 | - if self.previousSibling: | |
| 121 | - self.previousSibling.nextSibling = self.nextSibling | |
| 122 | - if self.nextSibling: | |
| 123 | - self.nextSibling.previousSibling = self.previousSibling | |
| 124 | - self.previousSibling = self.nextSibling = None | |
| 125 | - | |
| 126 | - def _lastRecursiveChild(self): | |
| 127 | - "Finds the last element beneath this object to be parsed." | |
| 128 | - lastChild = self | |
| 129 | - while hasattr(lastChild, 'contents') and lastChild.contents: | |
| 130 | - lastChild = lastChild.contents[-1] | |
| 131 | - return lastChild | |
| 132 | - | |
| 133 | - def insert(self, position, newChild): | |
| 134 | - if (isinstance(newChild, basestring) | |
| 135 | - or isinstance(newChild, unicode)) \ | |
| 136 | - and not isinstance(newChild, NavigableString): | |
| 137 | - newChild = NavigableString(newChild) | |
| 138 | - | |
| 139 | - position = min(position, len(self.contents)) | |
| 140 | - if hasattr(newChild, 'parent') and newChild.parent != None: | |
| 141 | - # We're 'inserting' an element that's already one | |
| 142 | - # of this object's children. | |
| 143 | - if newChild.parent == self: | |
| 144 | - index = self.find(newChild) | |
| 145 | - if index and index < position: | |
| 146 | - # Furthermore we're moving it further down the | |
| 147 | - # list of this object's children. That means that | |
| 148 | - # when we extract this element, our target index | |
| 149 | - # will jump down one. | |
| 150 | - position = position - 1 | |
| 151 | - newChild.extract() | |
| 152 | - | |
| 153 | - newChild.parent = self | |
| 154 | - previousChild = None | |
| 155 | - if position == 0: | |
| 156 | - newChild.previousSibling = None | |
| 157 | - newChild.previous = self | |
| 158 | - else: | |
| 159 | - previousChild = self.contents[position-1] | |
| 160 | - newChild.previousSibling = previousChild | |
| 161 | - newChild.previousSibling.nextSibling = newChild | |
| 162 | - newChild.previous = previousChild._lastRecursiveChild() | |
| 163 | - if newChild.previous: | |
| 164 | - newChild.previous.next = newChild | |
| 165 | - | |
| 166 | - newChildsLastElement = newChild._lastRecursiveChild() | |
| 167 | - | |
| 168 | - if position >= len(self.contents): | |
| 169 | - newChild.nextSibling = None | |
| 170 | - | |
| 171 | - parent = self | |
| 172 | - parentsNextSibling = None | |
| 173 | - while not parentsNextSibling: | |
| 174 | - parentsNextSibling = parent.nextSibling | |
| 175 | - parent = parent.parent | |
| 176 | - if not parent: # This is the last element in the document. | |
| 177 | - break | |
| 178 | - if parentsNextSibling: | |
| 179 | - newChildsLastElement.next = parentsNextSibling | |
| 180 | - else: | |
| 181 | - newChildsLastElement.next = None | |
| 182 | - else: | |
| 183 | - nextChild = self.contents[position] | |
| 184 | - newChild.nextSibling = nextChild | |
| 185 | - if newChild.nextSibling: | |
| 186 | - newChild.nextSibling.previousSibling = newChild | |
| 187 | - newChildsLastElement.next = nextChild | |
| 188 | - | |
| 189 | - if newChildsLastElement.next: | |
| 190 | - newChildsLastElement.next.previous = newChildsLastElement | |
| 191 | - self.contents.insert(position, newChild) | |
| 192 | - | |
| 193 | - def findNext(self, name=None, attrs={}, text=None, **kwargs): | |
| 194 | - """Returns the first item that matches the given criteria and | |
| 195 | - appears after this Tag in the document.""" | |
| 196 | - return self._findOne(self.findAllNext, name, attrs, text, **kwargs) | |
| 197 | - | |
| 198 | - def findAllNext(self, name=None, attrs={}, text=None, limit=None, | |
| 199 | - **kwargs): | |
| 200 | - """Returns all items that match the given criteria and appear | |
| 201 | - before after Tag in the document.""" | |
| 202 | - return self._findAll(name, attrs, text, limit, self.nextGenerator) | |
| 203 | - | |
| 204 | - def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): | |
| 205 | - """Returns the closest sibling to this Tag that matches the | |
| 206 | - given criteria and appears after this Tag in the document.""" | |
| 207 | - return self._findOne(self.findNextSiblings, name, attrs, text, | |
| 208 | - **kwargs) | |
| 209 | - | |
| 210 | - def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, | |
| 211 | - **kwargs): | |
| 212 | - """Returns the siblings of this Tag that match the given | |
| 213 | - criteria and appear after this Tag in the document.""" | |
| 214 | - return self._findAll(name, attrs, text, limit, | |
| 215 | - self.nextSiblingGenerator, **kwargs) | |
| 216 | - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x | |
| 217 | - | |
| 218 | - def findPrevious(self, name=None, attrs={}, text=None, **kwargs): | |
| 219 | - """Returns the first item that matches the given criteria and | |
| 220 | - appears before this Tag in the document.""" | |
| 221 | - return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) | |
| 222 | - | |
| 223 | - def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, | |
| 224 | - **kwargs): | |
| 225 | - """Returns all items that match the given criteria and appear | |
| 226 | - before this Tag in the document.""" | |
| 227 | - return self._findAll(name, attrs, text, limit, self.previousGenerator, | |
| 228 | - **kwargs) | |
| 229 | - fetchPrevious = findAllPrevious # Compatibility with pre-3.x | |
| 230 | - | |
| 231 | - def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): | |
| 232 | - """Returns the closest sibling to this Tag that matches the | |
| 233 | - given criteria and appears before this Tag in the document.""" | |
| 234 | - return self._findOne(self.findPreviousSiblings, name, attrs, text, | |
| 235 | - **kwargs) | |
| 236 | - | |
| 237 | - def findPreviousSiblings(self, name=None, attrs={}, text=None, | |
| 238 | - limit=None, **kwargs): | |
| 239 | - """Returns the siblings of this Tag that match the given | |
| 240 | - criteria and appear before this Tag in the document.""" | |
| 241 | - return self._findAll(name, attrs, text, limit, | |
| 242 | - self.previousSiblingGenerator, **kwargs) | |
| 243 | - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x | |
| 244 | - | |
| 245 | - def findParent(self, name=None, attrs={}, **kwargs): | |
| 246 | - """Returns the closest parent of this Tag that matches the given | |
| 247 | - criteria.""" | |
| 248 | - # NOTE: We can't use _findOne because findParents takes a different | |
| 249 | - # set of arguments. | |
| 250 | - r = None | |
| 251 | - l = self.findParents(name, attrs, 1) | |
| 252 | - if l: | |
| 253 | - r = l[0] | |
| 254 | - return r | |
| 255 | - | |
| 256 | - def findParents(self, name=None, attrs={}, limit=None, **kwargs): | |
| 257 | - """Returns the parents of this Tag that match the given | |
| 258 | - criteria.""" | |
| 259 | - | |
| 260 | - return self._findAll(name, attrs, None, limit, self.parentGenerator, | |
| 261 | - **kwargs) | |
| 262 | - fetchParents = findParents # Compatibility with pre-3.x | |
| 263 | - | |
| 264 | - #These methods do the real heavy lifting. | |
| 265 | - | |
| 266 | - def _findOne(self, method, name, attrs, text, **kwargs): | |
| 267 | - r = None | |
| 268 | - l = method(name, attrs, text, 1, **kwargs) | |
| 269 | - if l: | |
| 270 | - r = l[0] | |
| 271 | - return r | |
| 272 | - | |
| 273 | - def _findAll(self, name, attrs, text, limit, generator, **kwargs): | |
| 274 | - "Iterates over a generator looking for things that match." | |
| 275 | - | |
| 276 | - if isinstance(name, SoupStrainer): | |
| 277 | - strainer = name | |
| 278 | - else: | |
| 279 | - # Build a SoupStrainer | |
| 280 | - strainer = SoupStrainer(name, attrs, text, **kwargs) | |
| 281 | - results = ResultSet(strainer) | |
| 282 | - g = generator() | |
| 283 | - while True: | |
| 284 | - try: | |
| 285 | - i = g.next() | |
| 286 | - except StopIteration: | |
| 287 | - break | |
| 288 | - if i: | |
| 289 | - found = strainer.search(i) | |
| 290 | - if found: | |
| 291 | - results.append(found) | |
| 292 | - if limit and len(results) >= limit: | |
| 293 | - break | |
| 294 | - return results | |
| 295 | - | |
| 296 | - #These Generators can be used to navigate starting from both | |
| 297 | - #NavigableStrings and Tags. | |
| 298 | - def nextGenerator(self): | |
| 299 | - i = self | |
| 300 | - while i: | |
| 301 | - i = i.next | |
| 302 | - yield i | |
| 303 | - | |
| 304 | - def nextSiblingGenerator(self): | |
| 305 | - i = self | |
| 306 | - while i: | |
| 307 | - i = i.nextSibling | |
| 308 | - yield i | |
| 309 | - | |
| 310 | - def previousGenerator(self): | |
| 311 | - i = self | |
| 312 | - while i: | |
| 313 | - i = i.previous | |
| 314 | - yield i | |
| 315 | - | |
| 316 | - def previousSiblingGenerator(self): | |
| 317 | - i = self | |
| 318 | - while i: | |
| 319 | - i = i.previousSibling | |
| 320 | - yield i | |
| 321 | - | |
| 322 | - def parentGenerator(self): | |
| 323 | - i = self | |
| 324 | - while i: | |
| 325 | - i = i.parent | |
| 326 | - yield i | |
| 327 | - | |
| 328 | - # Utility methods | |
| 329 | - def substituteEncoding(self, str, encoding=None): | |
| 330 | - encoding = encoding or "utf-8" | |
| 331 | - return str.replace("%SOUP-ENCODING%", encoding) | |
| 332 | - | |
| 333 | - def toEncoding(self, s, encoding=None): | |
| 334 | - """Encodes an object to a string in some encoding, or to Unicode. | |
| 335 | - .""" | |
| 336 | - if isinstance(s, unicode): | |
| 337 | - if encoding: | |
| 338 | - s = s.encode(encoding) | |
| 339 | - elif isinstance(s, str): | |
| 340 | - if encoding: | |
| 341 | - s = s.encode(encoding) | |
| 342 | - else: | |
| 343 | - s = unicode(s) | |
| 344 | - else: | |
| 345 | - if encoding: | |
| 346 | - s = self.toEncoding(str(s), encoding) | |
| 347 | - else: | |
| 348 | - s = unicode(s) | |
| 349 | - return s | |
| 350 | - | |
| 351 | -class NavigableString(unicode, PageElement): | |
| 352 | - | |
| 353 | - def __getattr__(self, attr): | |
| 354 | - """text.string gives you text. This is for backwards | |
| 355 | - compatibility for Navigable*String, but for CData* it lets you | |
| 356 | - get the string without the CData wrapper.""" | |
| 357 | - if attr == 'string': | |
| 358 | - return self | |
| 359 | - else: | |
| 360 | - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) | |
| 361 | - | |
| 362 | - def __unicode__(self): | |
| 363 | - return self.__str__(None) | |
| 364 | - | |
| 365 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 366 | - if encoding: | |
| 367 | - return self.encode(encoding) | |
| 368 | - else: | |
| 369 | - return self | |
| 370 | - | |
| 371 | -class CData(NavigableString): | |
| 372 | - | |
| 373 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 374 | - return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) | |
| 375 | - | |
| 376 | -class ProcessingInstruction(NavigableString): | |
| 377 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 378 | - output = self | |
| 379 | - if "%SOUP-ENCODING%" in output: | |
| 380 | - output = self.substituteEncoding(output, encoding) | |
| 381 | - return "<?%s?>" % self.toEncoding(output, encoding) | |
| 382 | - | |
| 383 | -class Comment(NavigableString): | |
| 384 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 385 | - return "<!--%s-->" % NavigableString.__str__(self, encoding) | |
| 386 | - | |
| 387 | -class Declaration(NavigableString): | |
| 388 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 389 | - return "<!%s>" % NavigableString.__str__(self, encoding) | |
| 390 | - | |
| 391 | -class Tag(PageElement): | |
| 392 | - | |
| 393 | - """Represents a found HTML tag with its attributes and contents.""" | |
| 394 | - | |
| 395 | - XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot", | |
| 396 | - '"' : "quote", | |
| 397 | - "&" : "amp", | |
| 398 | - "<" : "lt", | |
| 399 | - ">" : "gt" } | |
| 400 | - | |
| 401 | - def __init__(self, parser, name, attrs=None, parent=None, | |
| 402 | - previous=None): | |
| 403 | - "Basic constructor." | |
| 404 | - | |
| 405 | - # We don't actually store the parser object: that lets extracted | |
| 406 | - # chunks be garbage-collected | |
| 407 | - self.parserClass = parser.__class__ | |
| 408 | - self.isSelfClosing = parser.isSelfClosingTag(name) | |
| 409 | - self.name = name | |
| 410 | - if attrs == None: | |
| 411 | - attrs = [] | |
| 412 | - self.attrs = attrs | |
| 413 | - self.contents = [] | |
| 414 | - self.setup(parent, previous) | |
| 415 | - self.hidden = False | |
| 416 | - self.containsSubstitutions = False | |
| 417 | - | |
| 418 | - def get(self, key, default=None): | |
| 419 | - """Returns the value of the 'key' attribute for the tag, or | |
| 420 | - the value given for 'default' if it doesn't have that | |
| 421 | - attribute.""" | |
| 422 | - return self._getAttrMap().get(key, default) | |
| 423 | - | |
| 424 | - def has_key(self, key): | |
| 425 | - return self._getAttrMap().has_key(key) | |
| 426 | - | |
| 427 | - def __getitem__(self, key): | |
| 428 | - """tag[key] returns the value of the 'key' attribute for the tag, | |
| 429 | - and throws an exception if it's not there.""" | |
| 430 | - return self._getAttrMap()[key] | |
| 431 | - | |
| 432 | - def __iter__(self): | |
| 433 | - "Iterating over a tag iterates over its contents." | |
| 434 | - return iter(self.contents) | |
| 435 | - | |
| 436 | - def __len__(self): | |
| 437 | - "The length of a tag is the length of its list of contents." | |
| 438 | - return len(self.contents) | |
| 439 | - | |
| 440 | - def __contains__(self, x): | |
| 441 | - return x in self.contents | |
| 442 | - | |
| 443 | - def __nonzero__(self): | |
| 444 | - "A tag is non-None even if it has no contents." | |
| 445 | - return True | |
| 446 | - | |
| 447 | - def __setitem__(self, key, value): | |
| 448 | - """Setting tag[key] sets the value of the 'key' attribute for the | |
| 449 | - tag.""" | |
| 450 | - self._getAttrMap() | |
| 451 | - self.attrMap[key] = value | |
| 452 | - found = False | |
| 453 | - for i in range(0, len(self.attrs)): | |
| 454 | - if self.attrs[i][0] == key: | |
| 455 | - self.attrs[i] = (key, value) | |
| 456 | - found = True | |
| 457 | - if not found: | |
| 458 | - self.attrs.append((key, value)) | |
| 459 | - self._getAttrMap()[key] = value | |
| 460 | - | |
| 461 | - def __delitem__(self, key): | |
| 462 | - "Deleting tag[key] deletes all 'key' attributes for the tag." | |
| 463 | - for item in self.attrs: | |
| 464 | - if item[0] == key: | |
| 465 | - self.attrs.remove(item) | |
| 466 | - #We don't break because bad HTML can define the same | |
| 467 | - #attribute multiple times. | |
| 468 | - self._getAttrMap() | |
| 469 | - if self.attrMap.has_key(key): | |
| 470 | - del self.attrMap[key] | |
| 471 | - | |
| 472 | - def __call__(self, *args, **kwargs): | |
| 473 | - """Calling a tag like a function is the same as calling its | |
| 474 | - findAll() method. Eg. tag('a') returns a list of all the A tags | |
| 475 | - found within this tag.""" | |
| 476 | - return apply(self.findAll, args, kwargs) | |
| 477 | - | |
| 478 | - def __getattr__(self, tag): | |
| 479 | - #print "Getattr %s.%s" % (self.__class__, tag) | |
| 480 | - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: | |
| 481 | - return self.find(tag[:-3]) | |
| 482 | - elif tag.find('__') != 0: | |
| 483 | - return self.find(tag) | |
| 484 | - | |
| 485 | - def __eq__(self, other): | |
| 486 | - """Returns true iff this tag has the same name, the same attributes, | |
| 487 | - and the same contents (recursively) as the given tag. | |
| 488 | - | |
| 489 | - NOTE: right now this will return false if two tags have the | |
| 490 | - same attributes in a different order. Should this be fixed?""" | |
| 491 | - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): | |
| 492 | - return False | |
| 493 | - for i in range(0, len(self.contents)): | |
| 494 | - if self.contents[i] != other.contents[i]: | |
| 495 | - return False | |
| 496 | - return True | |
| 497 | - | |
| 498 | - def __ne__(self, other): | |
| 499 | - """Returns true iff this tag is not identical to the other tag, | |
| 500 | - as defined in __eq__.""" | |
| 501 | - return not self == other | |
| 502 | - | |
| 503 | - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 504 | - """Renders this tag as a string.""" | |
| 505 | - return self.__str__(encoding) | |
| 506 | - | |
| 507 | - def __unicode__(self): | |
| 508 | - return self.__str__(None) | |
| 509 | - | |
| 510 | - def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, | |
| 511 | - prettyPrint=False, indentLevel=0): | |
| 512 | - """Returns a string or Unicode representation of this tag and | |
| 513 | - its contents. To get Unicode, pass None for encoding. | |
| 514 | - | |
| 515 | - NOTE: since Python's HTML parser consumes whitespace, this | |
| 516 | - method is not certain to reproduce the whitespace present in | |
| 517 | - the original string.""" | |
| 518 | - | |
| 519 | - encodedName = self.toEncoding(self.name, encoding) | |
| 520 | - | |
| 521 | - attrs = [] | |
| 522 | - if self.attrs: | |
| 523 | - for key, val in self.attrs: | |
| 524 | - fmt = '%s="%s"' | |
| 525 | - if isString(val): | |
| 526 | - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: | |
| 527 | - val = self.substituteEncoding(val, encoding) | |
| 528 | - | |
| 529 | - # The attribute value either: | |
| 530 | - # | |
| 531 | - # * Contains no embedded double quotes or single quotes. | |
| 532 | - # No problem: we enclose it in double quotes. | |
| 533 | - # * Contains embedded single quotes. No problem: | |
| 534 | - # double quotes work here too. | |
| 535 | - # * Contains embedded double quotes. No problem: | |
| 536 | - # we enclose it in single quotes. | |
| 537 | - # * Embeds both single _and_ double quotes. This | |
| 538 | - # can't happen naturally, but it can happen if | |
| 539 | - # you modify an attribute value after parsing | |
| 540 | - # the document. Now we have a bit of a | |
| 541 | - # problem. We solve it by enclosing the | |
| 542 | - # attribute in single quotes, and escaping any | |
| 543 | - # embedded single quotes to XML entities. | |
| 544 | - if '"' in val: | |
| 545 | - fmt = "%s='%s'" | |
| 546 | - # This can't happen naturally, but it can happen | |
| 547 | - # if you modify an attribute value after parsing. | |
| 548 | - if "'" in val: | |
| 549 | - val = val.replace("'", "&squot;") | |
| 550 | - | |
| 551 | - # Now we're okay w/r/t quotes. But the attribute | |
| 552 | - # value might also contain angle brackets, or | |
| 553 | - # ampersands that aren't part of entities. We need | |
| 554 | - # to escape those to XML entities too. | |
| 555 | - val = re.sub("([<>]|&(?![^\s]+;))", | |
| 556 | - lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", | |
| 557 | - val) | |
| 558 | - | |
| 559 | - attrs.append(fmt % (self.toEncoding(key, encoding), | |
| 560 | - self.toEncoding(val, encoding))) | |
| 561 | - close = '' | |
| 562 | - closeTag = '' | |
| 563 | - if self.isSelfClosing: | |
| 564 | - close = ' /' | |
| 565 | - else: | |
| 566 | - closeTag = '</%s>' % encodedName | |
| 567 | - | |
| 568 | - indentTag, indentContents = 0, 0 | |
| 569 | - if prettyPrint: | |
| 570 | - indentTag = indentLevel | |
| 571 | - space = (' ' * (indentTag-1)) | |
| 572 | - indentContents = indentTag + 1 | |
| 573 | - contents = self.renderContents(encoding, prettyPrint, indentContents) | |
| 574 | - if self.hidden: | |
| 575 | - s = contents | |
| 576 | - else: | |
| 577 | - s = [] | |
| 578 | - attributeString = '' | |
| 579 | - if attrs: | |
| 580 | - attributeString = ' ' + ' '.join(attrs) | |
| 581 | - if prettyPrint: | |
| 582 | - s.append(space) | |
| 583 | - s.append('<%s%s%s>' % (encodedName, attributeString, close)) | |
| 584 | - if prettyPrint: | |
| 585 | - s.append("\n") | |
| 586 | - s.append(contents) | |
| 587 | - if prettyPrint and contents and contents[-1] != "\n": | |
| 588 | - s.append("\n") | |
| 589 | - if prettyPrint and closeTag: | |
| 590 | - s.append(space) | |
| 591 | - s.append(closeTag) | |
| 592 | - if prettyPrint and closeTag and self.nextSibling: | |
| 593 | - s.append("\n") | |
| 594 | - s = ''.join(s) | |
| 595 | - return s | |
| 596 | - | |
| 597 | - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): | |
| 598 | - return self.__str__(encoding, True) | |
| 599 | - | |
| 600 | - def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, | |
| 601 | - prettyPrint=False, indentLevel=0): | |
| 602 | - """Renders the contents of this tag as a string in the given | |
| 603 | - encoding. If encoding is None, returns a Unicode string..""" | |
| 604 | - s=[] | |
| 605 | - for c in self: | |
| 606 | - text = None | |
| 607 | - if isinstance(c, NavigableString): | |
| 608 | - text = c.__str__(encoding) | |
| 609 | - elif isinstance(c, Tag): | |
| 610 | - s.append(c.__str__(encoding, prettyPrint, indentLevel)) | |
| 611 | - if text and prettyPrint: | |
| 612 | - text = text.strip() | |
| 613 | - if text: | |
| 614 | - if prettyPrint: | |
| 615 | - s.append(" " * (indentLevel-1)) | |
| 616 | - s.append(text) | |
| 617 | - if prettyPrint: | |
| 618 | - s.append("\n") | |
| 619 | - return ''.join(s) | |
| 620 | - | |
| 621 | - #Soup methods | |
| 622 | - | |
| 623 | - def find(self, name=None, attrs={}, recursive=True, text=None, | |
| 624 | - **kwargs): | |
| 625 | - """Return only the first child of this Tag matching the given | |
| 626 | - criteria.""" | |
| 627 | - r = None | |
| 628 | - l = self.findAll(name, attrs, recursive, text, 1, **kwargs) | |
| 629 | - if l: | |
| 630 | - r = l[0] | |
| 631 | - return r | |
| 632 | - findChild = find | |
| 633 | - | |
| 634 | - def findAll(self, name=None, attrs={}, recursive=True, text=None, | |
| 635 | - limit=None, **kwargs): | |
| 636 | - """Extracts a list of Tag objects that match the given | |
| 637 | - criteria. You can specify the name of the Tag and any | |
| 638 | - attributes you want the Tag to have. | |
| 639 | - | |
| 640 | - The value of a key-value pair in the 'attrs' map can be a | |
| 641 | - string, a list of strings, a regular expression object, or a | |
| 642 | - callable that takes a string and returns whether or not the | |
| 643 | - string matches for some custom definition of 'matches'. The | |
| 644 | - same is true of the tag name.""" | |
| 645 | - generator = self.recursiveChildGenerator | |
| 646 | - if not recursive: | |
| 647 | - generator = self.childGenerator | |
| 648 | - return self._findAll(name, attrs, text, limit, generator, **kwargs) | |
| 649 | - findChildren = findAll | |
| 650 | - | |
| 651 | - # Pre-3.x compatibility methods | |
| 652 | - first = find | |
| 653 | - fetch = findAll | |
| 654 | - | |
| 655 | - def fetchText(self, text=None, recursive=True, limit=None): | |
| 656 | - return self.findAll(text=text, recursive=recursive, limit=limit) | |
| 657 | - | |
| 658 | - def firstText(self, text=None, recursive=True): | |
| 659 | - return self.find(text=text, recursive=recursive) | |
| 660 | - | |
| 661 | - #Utility methods | |
| 662 | - | |
| 663 | - def append(self, tag): | |
| 664 | - """Appends the given tag to the contents of this tag.""" | |
| 665 | - self.contents.append(tag) | |
| 666 | - | |
| 667 | - #Private methods | |
| 668 | - | |
| 669 | - def _getAttrMap(self): | |
| 670 | - """Initializes a map representation of this tag's attributes, | |
| 671 | - if not already initialized.""" | |
| 672 | - if not getattr(self, 'attrMap'): | |
| 673 | - self.attrMap = {} | |
| 674 | - for (key, value) in self.attrs: | |
| 675 | - self.attrMap[key] = value | |
| 676 | - return self.attrMap | |
| 677 | - | |
| 678 | - #Generator methods | |
| 679 | - def childGenerator(self): | |
| 680 | - for i in range(0, len(self.contents)): | |
| 681 | - yield self.contents[i] | |
| 682 | - raise StopIteration | |
| 683 | - | |
| 684 | - def recursiveChildGenerator(self): | |
| 685 | - stack = [(self, 0)] | |
| 686 | - while stack: | |
| 687 | - tag, start = stack.pop() | |
| 688 | - if isinstance(tag, Tag): | |
| 689 | - for i in range(start, len(tag.contents)): | |
| 690 | - a = tag.contents[i] | |
| 691 | - yield a | |
| 692 | - if isinstance(a, Tag) and tag.contents: | |
| 693 | - if i < len(tag.contents) - 1: | |
| 694 | - stack.append((tag, i+1)) | |
| 695 | - stack.append((a, 0)) | |
| 696 | - break | |
| 697 | - raise StopIteration | |
| 698 | - | |
| 699 | -# Next, a couple classes to represent queries and their results. | |
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    name, attrs, text and **kwargs mirror the arguments accepted by the
    find* methods: each criterion may be a string, a compiled regular
    expression, a list, a callable, or True (see _matches below).
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        # NOTE(review): attrs={} is a mutable default argument, but it is
        # never mutated in place (copy() is taken before update), so it
        # is safe here.
        self.name = name
        if isString(attrs):
            # A bare string for attrs is shorthand for matching the
            # 'class' attribute.
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            # Fold keyword criteria into the attrs map without touching
            # the caller's dict.
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches this strainer against a tag, given either a real Tag
        object (as markupName) or a raw name plus an attribute list/map.
        Returns the matched Tag (or the raw name) on success, else None."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            # Called with a real Tag: the Tag doubles as its own
            # attribute mapping.
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is invoked with (name, attrs), but
        # only when we were handed raw tag data rather than a Tag.
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                # Every requested attribute must match for the tag to
                # match as a whole.
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                         if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                         else:
                            # Attributes arrived as a list of (key, value)
                            # pairs; build a dict view lazily, only once.
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Runs this strainer against markup that may be a list of
        elements, a Tag, or a string-like object; returns the first
        match or None. Raises Exception for unrecognized markup types."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        """Matches one piece of markup (a Tag, string, or None) against
        one criterion (True, a callable, a regexp, a list, a map, or a
        string). Returns a truthy value on a match."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            # The literal True criterion matches anything that exists.
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): markup is a string or None at this point,
                # neither of which has has_key -- this branch looks like
                # it expects a Tag; confirm before passing a dict here.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                # Coerce the criterion to the markup's string type so
                # the equality test below compares like with like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
| 818 | - | |
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it.

    source: the SoupStrainer that produced these results.
    """
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initialized a throwaway empty list instead of this instance.
        # Initialize self so the list base class is set up properly.
        list.__init__(self)
        self.source = source
| 825 | - | |
| 826 | -# Now, some helper functions. | |
| 827 | - | |
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    # Anything exposing the iterator protocol counts as listlike;
    # otherwise fall back to the classic concrete sequence types.
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
| 833 | - | |
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # Bug fix: the second check was misspelled 'isintance', so it
        # always raised NameError and fell through to the str-only
        # fallback; basestring covers both str and unicode (and their
        # subclasses) on Python 2.
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        # Python builds without unicode/basestring: plain str only.
        return isinstance(s, str)
| 841 | - | |
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    mapping = {}
    for piece in args:
        if hasattr(piece, 'items'):
            # A partial map: copy its entries through unchanged.
            for key, value in piece.items():
                mapping[key] = value
        elif isList(piece):
            # A list: each member becomes a key bound to the default.
            for key in piece:
                mapping[key] = default
        else:
            # A bare scalar: a single key bound to the default.
            mapping[piece] = default
    return mapping
| 860 | - | |
| 861 | -# Now, the parser classes. | |
| 862 | - | |
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Map of XML entity name -> True, built from the entities Tag knows
    # how to emit; handle_entityref consults it under XML_ENTITIES mode.
    XML_ENTITY_LIST = {}
    for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
        XML_ENTITY_LIST[i] = True

    # Subclasses override these class maps to teach the parser about
    # tag behavior (see BeautifulSoup below).
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    # Default (regex, replacement) fixups applied to the markup before
    # parsing: add the space sgmllib wants in "<br/>", and strip stray
    # whitespace from declarations like "<! --Comment-->".
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name given to the synthetic root Tag representing the document.
    ROOT_TAG_NAME = u'[document]'

    # Legal values for the convertEntities constructor argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            # Raised by subclasses (see start_meta) after re-feeding the
            # document under a newly discovered encoding.
            pass
        self.markup = None                 # The markup can now be GCed

    def _feed(self, inDocumentEncoding=None):
        """Converts self.markup to Unicode, applies the massage regexes,
        and runs the whole document through the SGML parser."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                # A truthy non-list value (e.g. the default True) means
                # "use the class-level MARKUP_MASSAGE fixups".
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        # start_/end_/do_ handlers belong to the parser; everything else
        # (except dunder lookups) is treated as tag navigation.
        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        # Both the class-level table and the per-instance table passed
        # to the constructor are honored.
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        # Re-initialize as the (hidden) root tag and clear all parser
        # state; the root itself becomes the bottom of the tag stack.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the top tag off the stack and returns the new current tag."""
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Appends tag to the current tag's contents and makes it current."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes any accumulated character data into the tree as a
        containerClass instance (NavigableString by default)."""
        if self.currentData:
            currentData = ''.join(self.currentData)
            # Collapse whitespace-only runs to a single newline or space.
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # Under a SoupStrainer, top-level text that doesn't match
            # the strainer is dropped entirely.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root never pops.
            return

        numPops = 0
        mostRecentTag = None
        # Find the topmost occurrence of the tag (index 0 is the root
        # and is deliberately excluded).
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        # Walk the open-tag stack from the top down, looking for either
        # a previous tag of the same name or a nesting-reset boundary.
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): p is reassigned from the stack at the top of
            # each iteration, so this parent-walk has no visible effect.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handles an opening tag: pops as needed per the nesting rules,
        creates a Tag node, and pushes it (unless self-closing)."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # Under a SoupStrainer, skip top-level tags that don't match.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            # Inside a quote tag (e.g. <script>) nested markup is
            # treated as literal text until the matching end tag.
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handles a closing tag: pops the stack back to the matching
        open tag, honoring any active quote tag."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        # Character data is buffered until endData() flushes it.
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        # Convert numeric references to Unicode only when entity
        # conversion was requested; otherwise pass them through intact.
        if self.convertEntities in [self.HTML_ENTITIES,
                                    self.XML_ENTITIES]:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML entity references to the corresponding Unicode
        characters."""
        data = None
        # XML mode only converts the handful of entities in
        # XML_ENTITY_LIST; HTML mode converts anything name2codepoint
        # knows about.
        if self.convertEntities == self.HTML_ENTITIES or \
               (self.convertEntities == self.XML_ENTITIES and \
                self.XML_ENTITY_LIST.get(ref)):
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass
        if not data:
            # Unknown (or unconverted) entity: keep it verbatim.
            data = '&%s;' % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            # Unterminated CDATA runs to the end of the input.
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Declaration sgmllib can't parse: keep it as raw text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
| 1230 | - | |
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        # Default smart-quote handling to HTML entities (the stone-soup
        # base defaults to XML entities) unless the caller chose.
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    # HTML tags that never take a closing tag.
    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    # Tags whose contents are treated as literal text, not markup.
    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out http-equiv and content, remembering where content
        # lives so it can be rewritten in place below.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                # NOTE(review): getattr without a default raises
                # AttributeError when declaredHTMLEncoding was never set;
                # later releases pass a None default here -- confirm.
                if getattr(self, 'declaredHTMLEncoding') or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    newAttr = self.CHARSET_RE.sub\
                              (lambda(match):match.group(1) +
                               "%SOUP-ENCODING%", value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
| 1377 | - | |
class StopParsing(Exception):
    """Raised internally to abandon the current parse -- e.g. by
    start_meta once the document's real encoding is discovered and the
    markup has been re-fed; the constructor catches it and moves on."""
| 1380 | - | |
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags treated as nestable within themselves.
    # NOTE(review): 'strong' and 'big' appear twice; harmless since
    # buildTagMap deduplicates via dict keys.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    # Block tags treated as nestable within themselves.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Extend the parent's nestable map with the extra tags above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
| 1416 | - | |
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): buildTagMap's first argument is the *default* and
    # no varargs follow, so this evaluates to an empty map -- i.e. no
    # nesting resets at all. Presumably intentional for "minimal"
    # behavior, but confirm ('noscript' is not actually registered).
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
| 1429 | - | |
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the normal pop, promote a single-string child tag into
        # a parent attribute -- but never overwrite an existing one.
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
| 1460 | - | |
| 1461 | -#Enterprise class names! It has come to our attention that some people | |
| 1462 | -#think the names of the Beautiful Soup parser classes are too silly | |
| 1463 | -#and "unprofessional" for use in enterprise screen-scraping. We feel | |
| 1464 | -#your pain! For such-minded folk, the Beautiful Soup Consortium And | |
| 1465 | -#All-Night Kosher Bakery recommends renaming this file to | |
| 1466 | -#"RobustParser.py" (or, in cases of extreme enterprisness, | |
| 1467 | -#"RobustParserBeanInterface.class") and using the following | |
| 1468 | -#enterprise-friendly class aliases: | |
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
| 1479 | - | |
| 1480 | -###################################################### | |
| 1481 | -# | |
| 1482 | -# Bonus library: Unicode, Dammit | |
| 1483 | -# | |
| 1484 | -# This class forces XML data into a standard format (usually to UTF-8 | |
| 1485 | -# or Unicode). It is heavily based on code from Mark Pilgrim's | |
| 1486 | -# Universal Feed Parser. It does not rewrite the XML or HTML to | |
| 1487 | -# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi | |
| 1488 | -# (XML) and BeautifulSoup.start_meta (HTML). | |
| 1489 | - | |
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    # Only swallow a missing module; a bare `except:` would also hide
    # real errors (and even KeyboardInterrupt).
    chardet = None
# NOTE(review): chardet support is deliberately switched off here even
# when the library is installed, making the try/import above a no-op.
# Delete this line to re-enable autodetection.
chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
| 1511 | - | |
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction, `self.unicode` holds the converted document (or
    None if every conversion attempt failed) and `self.originalEncoding`
    holds the codec that succeeded."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # markup: the raw document (str or unicode).
    # overrideEncodings: encodings to try before anything sniffed from
    #   the document.  NOTE(review): mutable default argument -- safe
    #   only because it is never mutated here, but fragile.
    # smartQuotesTo: 'xml' or anything else ('html'); see _subMSChar.
    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        # Already-unicode (or empty) input needs no conversion.
        if markup == '' or isinstance(markup, unicode):
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        # Try encodings in priority order: caller overrides first, then
        # what the document declares, then what the BOM sniffing found.
        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break
        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        # Tuple entries in MS_CHARS are (entity name, hex codepoint);
        # plain-string entries are substituted verbatim.
        if type(sub) == types.TupleType:
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup with the proposed encoding.

        Returns the unicode document on success, None on failure (or if
        this encoding was already tried)."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            # On success, cache the decoded markup and remember which
            # codec worked.
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a recognized BOM also
        # overrides the proposed encoding.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.

        Returns (possibly re-encoded data, declared encoding or None,
        BOM/pattern-sniffed encoding or None)."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            # Sniff the byte pattern of "<?" (or an EBCDIC prologue) in
            # the various UTF encodings, with and without a BOM.
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
            # Look for an explicit encoding in the XML declaration.
            xml_encoding_match = re.compile \
                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
                                 .match(xml_data)
        except:
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # A declared multi-byte family name (e.g. "utf-16") is less
            # specific than what the BOM told us; prefer the sniffed one.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a codec Python knows, trying the alias
        table and common punctuation variants; falls back to the input."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except LookupError:
            pass
        return codec

    # Lazily-built translation table from EBCDIC to ASCII, shared by all
    # instances via the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Windows-1252 "smart" punctuation bytes mapped to
    # (HTML entity name, hex codepoint) pairs, or to a literal
    # replacement string where no entity applies.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
| 1759 | - | |
| 1760 | -####################################################################### | |
| 1761 | - | |
| 1762 | - | |
#By default, act as an HTML pretty-printer.
# Reads an HTML document from stdin and writes an indented version of
# the parse tree to stdout (Python 2 print statement).
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin.read())
    print soup.prettify()
pacotes/openlayers/tools/README.txt
| ... | ... | @@ -1,14 +0,0 @@ |
| 1 | -This directory contains tools used in the packaging or deployment of OpenLayers. | |
| 2 | - | |
| 3 | -Javascript minimizing tools: | |
| 4 | - | |
| 5 | - * jsmin.c, jsmin.py: | |
| 6 | - jsmin.py is a direct translation of the jsmin.c code into Python. jsmin.py | |
| 7 | - will therefore run anyplace Python runs... but at significantly slower speed. | |
| 8 | - | |
| 9 | - * shrinksafe.py | |
| 10 | -   shrinksafe.py calls out to a third party javascript shrinking service. This | |
| 11 | -   creates files about 4% smaller (as of commit 501) than jsmin does for the | |
| 12 | -   OpenLayers code. However, it also has the side effect of making you dependent | |
| 13 | -   on the web service -- and since that service sometimes goes dead, it's risky to | |
| 14 | - depend on it. |
pacotes/openlayers/tools/exampleparser.py
| ... | ... | @@ -1,251 +0,0 @@ |
| 1 | -#!/usr/bin/env python | |
| 2 | - | |
| 3 | -import sys | |
| 4 | -import os | |
| 5 | -import re | |
| 6 | -import urllib2 | |
| 7 | -import time | |
| 8 | -from xml.dom.minidom import Document | |
| 9 | - | |
| 10 | -try: | |
| 11 | - import xml.etree.ElementTree as ElementTree | |
| 12 | -except ImportError: | |
| 13 | - try: | |
| 14 | - import cElementTree as ElementTree | |
| 15 | - except ImportError: | |
| 16 | - try: | |
| 17 | - import elementtree.ElementTree as ElementTree | |
| 18 | - except ImportError: | |
| 19 | - import lxml.etree as ElementTree | |
| 20 | - | |
| 21 | -missing_deps = False | |
| 22 | -try: | |
| 23 | - import simplejson | |
| 24 | - from BeautifulSoup import BeautifulSoup | |
| 25 | -except ImportError, E: | |
| 26 | - missing_deps = E | |
| 27 | - | |
| 28 | -feedName = "example-list.xml" | |
| 29 | -feedPath = "http://openlayers.org/dev/examples/" | |
| 30 | - | |
def getListOfOnlineExamples(baseUrl):
    """
    useful if you want to get a list of examples from a url. not used by
    default.

    Fetches *baseUrl*, finds every <li>, and returns the hrefs of their
    anchors that point at .html pages.
    """
    html = urllib2.urlopen(baseUrl)
    soup = BeautifulSoup(html)
    anchors = [item.find('a') for item in soup.findAll('li')]
    # Skip list items that have no anchor or no href instead of crashing
    # on them (the original assumed every <li> contained <a href=...>,
    # and also made a pointless extra copy of the list).
    return [anchor.get('href') for anchor in anchors
            if anchor is not None
            and (anchor.get('href') or '').endswith('.html')]
| 42 | - | |
def getListOfExamples(relPath):
    """
    returns list of .html filenames within a given path - excludes
    example-list.html
    """
    names = []
    for name in os.listdir(relPath):
        if not name.endswith('.html'):
            continue
        if name == "example-list.html":
            continue
        names.append(name)
    return names
| 50 | - | |
| 51 | - | |
def getExampleHtml(location):
    """
    returns html of a specific example that is available online or locally

    *location* is either an http(s) URL or a local file path.
    """
    # Progress marker; the trailing comma suppresses the newline
    # (Python 2 print statement).
    print '.',
    if location.startswith('http'):
        return urllib2.urlopen(location).read()
    else:
        f = open(location)
        html = f.read()
        f.close()
        return html
| 64 | - | |
| 65 | - | |
def extractById(soup, tagId, value=None):
    """
    returns full contents of a particular tag id

    Falls back to *value* when the tag is missing or empty.  Tabs and
    newlines are stripped from the rendered contents.
    """
    match = soup.find(id=tagId)
    if match and match.contents:
        rendered = str(match.renderContents()).strip()
        value = rendered.replace('\t', '').replace('\n', '')
    return value
| 77 | - | |
def getRelatedClasses(html):
    """
    parses the html, and returns a list of all OpenLayers Classes
    used within (ie what parts of OL the javascript uses).
    """
    # Anything that looks like a call on an OpenLayers.* name counts.
    pattern = re.compile(r'(?P<class>OpenLayers\..*?)\(')
    return pattern.findall(html)
| 85 | - | |
def parseHtml(html, ids):
    """
    returns dictionary of items of interest

    One entry per id in *ids* (extracted with extractById) plus a
    'classes' entry listing the OpenLayers classes the page uses.
    """
    soup = BeautifulSoup(html)
    info = {}
    for tagId in ids:
        info[tagId] = extractById(soup, tagId)
    #classes should eventually be parsed from docs - not automatically created.
    info['classes'] = getRelatedClasses(html)
    return info
| 98 | - | |
def getSvnInfo(path):
    """
    Ask ``svn info`` about *path* and return a dict with its url plus
    the author and date of the last commit.
    """
    pipe = os.popen("svn info %s --xml" % path)
    tree = ElementTree.fromstring(pipe.read())
    pipe.close()
    return {
        'url': tree.findtext('entry/url'),
        'author': tree.findtext('entry/commit/author'),
        'date': tree.findtext('entry/commit/date'),
    }
| 109 | - | |
def createFeed(examples):
    """Build an Atom feed (an xml.dom.minidom Document) describing
    *examples*, newest entry first.

    Each example dict must provide "example", "title", "shortdesc",
    "author" and "modified" values.
    """
    doc = Document()
    atomuri = "http://www.w3.org/2005/Atom"
    feed = doc.createElementNS(atomuri, "feed")
    feed.setAttribute("xmlns", atomuri)
    title = doc.createElementNS(atomuri, "title")
    title.appendChild(doc.createTextNode("OpenLayers Examples"))
    feed.appendChild(title)
    link = doc.createElementNS(atomuri, "link")
    link.setAttribute("rel", "self")
    link.setAttribute("href", feedPath + feedName)
    # Bug fix: the self link was built but never attached to the feed.
    feed.appendChild(link)

    # Bug fix: %H (24-hour clock) instead of %I -- 12-hour timestamps
    # are not valid in Atom dates.
    modtime = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    id = doc.createElementNS(atomuri, "id")
    id.appendChild(doc.createTextNode("%s%s#%s" % (feedPath, feedName, modtime)))
    feed.appendChild(id)

    updated = doc.createElementNS(atomuri, "updated")
    updated.appendChild(doc.createTextNode(modtime))
    feed.appendChild(updated)

    # sorted() already yields newest-first below; the previous in-place
    # examples.sort() was redundant and mutated the caller's list.
    for example in sorted(examples, key=lambda x: x["modified"], reverse=True):
        entry = doc.createElementNS(atomuri, "entry")

        title = doc.createElementNS(atomuri, "title")
        title.appendChild(doc.createTextNode(example["title"] or example["example"]))
        entry.appendChild(title)

        link = doc.createElementNS(atomuri, "link")
        link.setAttribute("href", "%s%s" % (feedPath, example["example"]))
        entry.appendChild(link)

        summary = doc.createElementNS(atomuri, "summary")
        summary.appendChild(doc.createTextNode(example["shortdesc"] or example["example"]))
        entry.appendChild(summary)

        updated = doc.createElementNS(atomuri, "updated")
        updated.appendChild(doc.createTextNode(example["modified"]))
        entry.appendChild(updated)

        author = doc.createElementNS(atomuri, "author")
        name = doc.createElementNS(atomuri, "name")
        name.appendChild(doc.createTextNode(example["author"]))
        author.appendChild(name)
        entry.appendChild(author)

        id = doc.createElementNS(atomuri, "id")
        id.appendChild(doc.createTextNode("%s%s#%s" % (feedPath, example["example"], example["modified"])))
        entry.appendChild(id)

        feed.appendChild(entry)

    doc.appendChild(feed)
    return doc
| 165 | - | |
def wordIndex(examples):
    """
    Create an inverted index based on words in title and shortdesc. Keys are
    lower cased words. Values are dictionaries with example index keys and
    count values.
    """
    index = {}
    unword = re.compile("\\W+")
    keys = ["shortdesc", "title"]
    for i in range(len(examples)):
        for key in keys:
            text = examples[i][key]
            if text:
                for word in unword.split(text):
                    if word:
                        word = word.lower()
                        # dict.has_key() was removed in Python 3;
                        # setdefault/get give identical counts and work
                        # in Python 2 as well.
                        counts = index.setdefault(word, {})
                        counts[i] = counts.get(i, 0) + 1
    return index
| 191 | - | |
if __name__ == "__main__":

    # Bail out early if the third-party parsing dependencies were not
    # importable (see the try/except at the top of the file).
    if missing_deps:
        print "This script requires simplejson and BeautifulSoup. You don't have them. \n(%s)" % E
        sys.exit()

    # Output file for the JSON example index; default lives next to the
    # examples themselves.
    if len(sys.argv) > 1:
        outFile = open(sys.argv[1],'w')
    else:
        outFile = open('../examples/example-list.js','w')

    examplesLocation = '../examples'
    print 'Reading examples from %s and writing out to %s' % (examplesLocation, outFile.name)

    exampleList = []
    docIds = ['title','shortdesc']

    #comment out option to create docs from online resource
    #examplesLocation = 'http://svn.openlayers.org/sandbox/docs/examples/'
    #examples = getListOfOnlineExamples(examplesLocation)

    examples = getListOfExamples(examplesLocation)

    # Fallback timestamp when svn has no date for a file.
    # NOTE(review): %I is the 12-hour clock -- %H is presumably what an
    # ISO-style timestamp wants; confirm before changing.
    modtime = time.strftime("%Y-%m-%dT%I:%M:%SZ", time.gmtime())

    # Collect title/shortdesc/classes plus svn metadata per example.
    for example in examples:
        url = os.path.join(examplesLocation,example)
        html = getExampleHtml(url)
        tagvalues = parseHtml(html,docIds)
        tagvalues['example'] = example
        # add in svn info
        d = getSvnInfo(url)
        tagvalues["modified"] = d["date"] or modtime
        tagvalues["author"] = d["author"] or "anonymous"
        tagvalues['link'] = example

        exampleList.append(tagvalues)

    exampleList.sort(key=lambda x:x['example'].lower())

    index = wordIndex(exampleList)

    json = simplejson.dumps({"examples": exampleList, "index": index})
    #give the json a global variable we can use in our js. This should be replaced or made optional.
    json = 'var info=' + json
    outFile.write(json)
    outFile.close()

    # Also emit an Atom feed of the examples, newest first.
    print "writing feed to ../examples/%s " % feedName
    atom = open('../examples/%s' % feedName, 'w')
    doc = createFeed(exampleList)
    atom.write(doc.toxml())
    atom.close()


    print 'complete'
| 250 | - | |
| 251 | - |
pacotes/openlayers/tools/jsmin.c
| ... | ... | @@ -1,272 +0,0 @@ |
| 1 | -/* jsmin.c | |
| 2 | - 2006-05-04 | |
| 3 | - | |
| 4 | -Copyright (c) 2002 Douglas Crockford (www.crockford.com) | |
| 5 | - | |
| 6 | -Permission is hereby granted, free of charge, to any person obtaining a copy of | |
| 7 | -this software and associated documentation files (the "Software"), to deal in | |
| 8 | -the Software without restriction, including without limitation the rights to | |
| 9 | -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
| 10 | -of the Software, and to permit persons to whom the Software is furnished to do | |
| 11 | -so, subject to the following conditions: | |
| 12 | - | |
| 13 | -The above copyright notice and this permission notice shall be included in all | |
| 14 | -copies or substantial portions of the Software. | |
| 15 | - | |
| 16 | -The Software shall be used for Good, not Evil. | |
| 17 | - | |
| 18 | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 19 | -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 20 | -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 21 | -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 22 | -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 23 | -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 24 | -SOFTWARE. | |
| 25 | -*/ | |
| 26 | - | |
| 27 | -#include <stdlib.h> | |
| 28 | -#include <stdio.h> | |
| 29 | - | |
static int theA;                  /* most recently considered character */
static int theB;                  /* the character after theA */
static int theLookahead = EOF;    /* one-character pushback filled by peek() */
| 33 | - | |
| 34 | - | |
| 35 | -/* isAlphanum -- return true if the character is a letter, digit, underscore, | |
| 36 | - dollar sign, or non-ASCII character. | |
| 37 | -*/ | |
| 38 | - | |
/* True when c can be part of an identifier: ASCII letter, digit,
   underscore, dollar sign, backslash, or any byte above 126. */
static int
isAlphanum(int c)
{
    if (c > 126) {
        return 1;
    }
    return (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9')
        || c == '_' || c == '$' || c == '\\';
}
| 46 | - | |
| 47 | - | |
| 48 | -/* get -- return the next character from stdin. Watch out for lookahead. If | |
| 49 | - the character is a control character, translate it to a space or | |
| 50 | - linefeed. | |
| 51 | -*/ | |
| 52 | - | |
| 53 | -static int | |
| 54 | -get() | |
| 55 | -{ | |
| 56 | - int c = theLookahead; | |
| 57 | - theLookahead = EOF; | |
| 58 | - if (c == EOF) { | |
| 59 | - c = getc(stdin); | |
| 60 | - } | |
| 61 | - if (c >= ' ' || c == '\n' || c == EOF) { | |
| 62 | - return c; | |
| 63 | - } | |
| 64 | - if (c == '\r') { | |
| 65 | - return '\n'; | |
| 66 | - } | |
| 67 | - return ' '; | |
| 68 | -} | |
| 69 | - | |
| 70 | - | |
| 71 | -/* peek -- get the next character without getting it. | |
| 72 | -*/ | |
| 73 | - | |
| 74 | -static int | |
| 75 | -peek() | |
| 76 | -{ | |
| 77 | - theLookahead = get(); | |
| 78 | - return theLookahead; | |
| 79 | -} | |
| 80 | - | |
| 81 | - | |
| 82 | -/* next -- get the next character, excluding comments. peek() is used to see | |
| 83 | - if a '/' is followed by a '/' or '*'. | |
| 84 | -*/ | |
| 85 | - | |
static int
next()
{
    int c = get();
    if (c == '/') {
        switch (peek()) {
        case '/':
            /* Line comment: discard until end of line and return the
               terminating character itself. */
            for (;;) {
                c = get();
                if (c <= '\n') {
                    return c;
                }
            }
        case '*':
            /* Block comment: discard until the closing star-slash and
               hand back a single space in its place. */
            get();
            for (;;) {
                switch (get()) {
                case '*':
                    if (peek() == '/') {
                        get();
                        return ' ';
                    }
                    break;
                case EOF:
                    fprintf(stderr, "Error: JSMIN Unterminated comment.\n");
                    exit(1);
                }
            }
        default:
            /* A lone '/' (division or regexp start) is passed through. */
            return c;
        }
    }
    return c;
}
| 120 | - | |
| 121 | - | |
| 122 | -/* action -- do something! What you do is determined by the argument: | |
| 123 | - 1 Output A. Copy B to A. Get the next B. | |
| 124 | - 2 Copy B to A. Get the next B. (Delete A). | |
| 125 | - 3 Get the next B. (Delete B). | |
| 126 | - action treats a string as a single character. Wow! | |
| 127 | - action recognizes a regular expression if it is preceded by ( or , or =. | |
| 128 | -*/ | |
| 129 | - | |
| 130 | -static void | |
| 131 | -action(int d) | |
| 132 | -{ | |
| 133 | - switch (d) { | |
| 134 | - case 1: | |
| 135 | - putc(theA, stdout); | |
| 136 | - case 2: | |
| 137 | - theA = theB; | |
| 138 | - if (theA == '\'' || theA == '"') { | |
| 139 | - for (;;) { | |
| 140 | - putc(theA, stdout); | |
| 141 | - theA = get(); | |
| 142 | - if (theA == theB) { | |
| 143 | - break; | |
| 144 | - } | |
| 145 | - if (theA <= '\n') { | |
| 146 | - fprintf(stderr, | |
| 147 | -"Error: JSMIN unterminated string literal: %c\n", theA); | |
| 148 | - exit(1); | |
| 149 | - } | |
| 150 | - if (theA == '\\') { | |
| 151 | - putc(theA, stdout); | |
| 152 | - theA = get(); | |
| 153 | - } | |
| 154 | - } | |
| 155 | - } | |
| 156 | - case 3: | |
| 157 | - theB = next(); | |
| 158 | - if (theB == '/' && (theA == '(' || theA == ',' || theA == '=' || | |
| 159 | - theA == ':' || theA == '[' || theA == '!' || theA == '&' || | |
| 160 | - theA == '|')) { | |
| 161 | - putc(theA, stdout); | |
| 162 | - putc(theB, stdout); | |
| 163 | - for (;;) { | |
| 164 | - theA = get(); | |
| 165 | - if (theA == '/') { | |
| 166 | - break; | |
| 167 | - } else if (theA =='\\') { | |
| 168 | - putc(theA, stdout); | |
| 169 | - theA = get(); | |
| 170 | - } else if (theA <= '\n') { | |
| 171 | - fprintf(stderr, | |
| 172 | -"Error: JSMIN unterminated Regular Expression literal.\n", theA); | |
| 173 | - exit(1); | |
| 174 | - } | |
| 175 | - putc(theA, stdout); | |
| 176 | - } | |
| 177 | - theB = next(); | |
| 178 | - } | |
| 179 | - } | |
| 180 | -} | |
| 181 | - | |
| 182 | - | |
| 183 | -/* jsmin -- Copy the input to the output, deleting the characters which are | |
| 184 | - insignificant to JavaScript. Comments will be removed. Tabs will be | |
| 185 | - replaced with spaces. Carriage returns will be replaced with linefeeds. | |
| 186 | - Most spaces and linefeeds will be removed. | |
| 187 | -*/ | |
| 188 | - | |
static void
jsmin()
{
    theA = '\n';
    action(3);
    while (theA != EOF) {
        switch (theA) {
        case ' ':
            /* A space survives only between identifier characters. */
            if (isAlphanum(theB)) {
                action(1);
            } else {
                action(2);
            }
            break;
        case '\n':
            /* A linefeed survives only where removing it could join
               tokens or change meaning. */
            switch (theB) {
            case '{':
            case '[':
            case '(':
            case '+':
            case '-':
                action(1);
                break;
            case ' ':
                action(3);
                break;
            default:
                if (isAlphanum(theB)) {
                    action(1);
                } else {
                    action(2);
                }
            }
            break;
        default:
            switch (theB) {
            case ' ':
                if (isAlphanum(theA)) {
                    action(1);
                    break;
                }
                action(3);
                break;
            case '\n':
                switch (theA) {
                case '}':
                case ']':
                case ')':
                case '+':
                case '-':
                case '"':
                case '\'':
                    action(1);
                    break;
                default:
                    if (isAlphanum(theA)) {
                        action(1);
                    } else {
                        action(3);
                    }
                }
                break;
            default:
                action(1);
                break;
            }
        }
    }
}
| 258 | - | |
| 259 | - | |
| 260 | -/* main -- Output any command line arguments as comments | |
| 261 | - and then minify the input. | |
| 262 | -*/ | |
extern int
main(int argc, char* argv[])
{
    /* Echo every command-line argument as a "//" comment so it survives
       minification, then minify standard input to standard output. */
    int argIndex;
    for (argIndex = 1; argIndex < argc; ++argIndex) {
        fprintf(stdout, "// %s\n", argv[argIndex]);
    }
    jsmin();
    return 0;
}
pacotes/openlayers/tools/jsmin.py
| ... | ... | @@ -1,216 +0,0 @@ |
| 1 | -#!/usr/bin/python | |
| 2 | - | |
| 3 | -# This code is original from jsmin by Douglas Crockford, it was translated to | |
| 4 | -# Python by Baruch Even. The original code had the following copyright and | |
| 5 | -# license. | |
| 6 | -# | |
| 7 | -# /* jsmin.c | |
| 8 | -# 2007-01-08 | |
| 9 | -# | |
| 10 | -# Copyright (c) 2002 Douglas Crockford (www.crockford.com) | |
| 11 | -# | |
| 12 | -# Permission is hereby granted, free of charge, to any person obtaining a copy of | |
| 13 | -# this software and associated documentation files (the "Software"), to deal in | |
| 14 | -# the Software without restriction, including without limitation the rights to | |
| 15 | -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies | |
| 16 | -# of the Software, and to permit persons to whom the Software is furnished to do | |
| 17 | -# so, subject to the following conditions: | |
| 18 | -# | |
| 19 | -# The above copyright notice and this permission notice shall be included in all | |
| 20 | -# copies or substantial portions of the Software. | |
| 21 | -# | |
| 22 | -# The Software shall be used for Good, not Evil. | |
| 23 | -# | |
| 24 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 25 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 26 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 27 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 28 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 29 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 30 | -# SOFTWARE. | |
| 31 | -# */ | |
| 32 | - | |
| 33 | -from StringIO import StringIO | |
| 34 | - | |
def jsmin(js):
    """Minify the JavaScript source string *js* and return the result.

    Convenience wrapper around JavascriptMinify that works on strings
    instead of streams.  A single leading newline emitted by the
    minifier is stripped from the result.
    """
    ins = StringIO(js)
    outs = StringIO()
    JavascriptMinify().minify(ins, outs)
    # Renamed from `str`, which shadowed the builtin of the same name.
    result = outs.getvalue()
    if len(result) > 0 and result[0] == '\n':
        result = result[1:]
    return result
| 43 | - | |
def isAlphanum(c):
    """Return True if *c* is a letter, digit, underscore, dollar sign,
    backslash, or non-ASCII character; False for None or anything else.

    The explicit None guard (instead of relying on None ordering against
    strings) keeps the function valid under Python 3, where None cannot
    be compared to str.  The stray trailing semicolon is also gone.
    """
    if c is None:
        return False
    return ('a' <= c <= 'z' or 'A' <= c <= 'Z' or '0' <= c <= '9'
            or c in '_$\\' or ord(c) > 126)
| 50 | - | |
class UnterminatedComment(Exception):
    """Raised when a /* ... */ comment is still open at end of input."""


class UnterminatedStringLiteral(Exception):
    """Raised when a string literal is not closed before a control character."""


class UnterminatedRegularExpression(Exception):
    """Raised when a regex literal is not closed before a control character."""
class JavascriptMinify(object):
    """Stream JavaScript minifier (port of Douglas Crockford's jsmin.c).

    Call :meth:`minify` with an input and an output stream.  The
    algorithm keeps a two-character window ``theA``/``theB`` over the
    input and decides, for each pair, whether the first character and
    the whitespace between them can be dropped.  ``theLookahead`` holds
    a single pushed-back character for :meth:`_peek`.
    """

    def _outA(self):
        # Write the current character to the output stream.
        self.outstream.write(self.theA)

    def _outB(self):
        # Write the lookahead character to the output stream.
        self.outstream.write(self.theB)

    def _get(self):
        """Return the next character from the input.  Watch out for
        lookahead.  If the character is a control character, translate
        it to a space or linefeed; EOF is reported as '\\000'.
        """
        c = self.theLookahead
        self.theLookahead = None
        if c is None:  # identity test instead of the original `== None`
            c = self.instream.read(1)
        if c >= ' ' or c == '\n':
            return c
        if c == '':  # EOF
            return '\000'
        if c == '\r':
            return '\n'
        return ' '

    def _peek(self):
        """Look at the next character without consuming it."""
        self.theLookahead = self._get()
        return self.theLookahead

    def _next(self):
        """Get the next character, excluding comments.  _peek() is used
        to see if a '/' is followed by a '/' or '*'.
        """
        c = self._get()
        if c == '/':
            p = self._peek()
            if p == '/':
                # '//' line comment: skip to end of line.
                c = self._get()
                while c > '\n':
                    c = self._get()
                return c
            if p == '*':
                # '/* ... */' block comment: replaced by a single space.
                c = self._get()
                while 1:
                    c = self._get()
                    if c == '*':
                        if self._peek() == '/':
                            self._get()
                            return ' '
                    if c == '\000':
                        raise UnterminatedComment()
        return c

    def _action(self, action):
        """Do something!  What you do is determined by the argument:
        1   Output A. Copy B to A. Get the next B.
        2   Copy B to A. Get the next B. (Delete A).
        3   Get the next B. (Delete B).
        _action treats a string as a single character, and recognizes a
        regular expression if it is preceded by ( or , or = (etc.).
        """
        if action <= 1:
            self._outA()

        if action <= 2:
            self.theA = self.theB
            if self.theA == "'" or self.theA == '"':
                # String literal: copy verbatim up to the matching quote,
                # honouring backslash escapes.
                while 1:
                    self._outA()
                    self.theA = self._get()
                    if self.theA == self.theB:
                        break
                    if self.theA <= '\n':
                        raise UnterminatedStringLiteral()
                    if self.theA == '\\':
                        self._outA()
                        self.theA = self._get()

        if action <= 3:
            self.theB = self._next()
            if self.theB == '/' and (self.theA == '(' or self.theA == ',' or
                                     self.theA == '=' or self.theA == ':' or
                                     self.theA == '[' or self.theA == '?' or
                                     self.theA == '!' or self.theA == '&' or
                                     self.theA == '|'):
                # In these contexts a '/' starts a regular expression
                # literal rather than a division: copy it verbatim.
                self._outA()
                self._outB()
                while 1:
                    self.theA = self._get()
                    if self.theA == '/':
                        break
                    elif self.theA == '\\':
                        self._outA()
                        self.theA = self._get()
                    elif self.theA <= '\n':
                        raise UnterminatedRegularExpression()
                    self._outA()
                self.theB = self._next()

    def _jsmin(self):
        """Copy the input to the output, deleting the characters which are
        insignificant to JavaScript.  Comments will be removed.  Tabs will
        be replaced with spaces.  Carriage returns will be replaced with
        linefeeds.  Most spaces and linefeeds will be removed.
        """
        self.theA = '\n'
        self._action(3)

        while self.theA != '\000':
            if self.theA == ' ':
                if isAlphanum(self.theB):
                    self._action(1)
                else:
                    self._action(2)
            elif self.theA == '\n':
                if self.theB in ['{', '[', '(', '+', '-']:
                    self._action(1)
                elif self.theB == ' ':
                    self._action(3)
                else:
                    if isAlphanum(self.theB):
                        self._action(1)
                    else:
                        self._action(2)
            else:
                if self.theB == ' ':
                    if isAlphanum(self.theA):
                        self._action(1)
                    else:
                        self._action(3)
                elif self.theB == '\n':
                    # A linefeed after these tokens may be significant;
                    # keep it.
                    if self.theA in ['}', ']', ')', '+', '-', '"', '\'']:
                        self._action(1)
                    else:
                        if isAlphanum(self.theA):
                            self._action(1)
                        else:
                            self._action(3)
                else:
                    self._action(1)

    def minify(self, instream, outstream):
        """Minify *instream* into *outstream*.  Closes *instream* when
        finished; the caller keeps ownership of *outstream*.
        """
        self.instream = instream
        self.outstream = outstream
        self.theA = None
        # BUG FIX: the original assigned `self.thaB` (a typo), leaving
        # the `theB` attribute unset here.
        self.theB = None
        self.theLookahead = None

        self._jsmin()
        self.instream.close()
| 212 | - | |
if __name__ == '__main__':
    # Filter stdin to stdout when invoked as a script.
    import sys
    JavascriptMinify().minify(sys.stdin, sys.stdout)
pacotes/openlayers/tools/mergejs.py
| ... | ... | @@ -1,252 +0,0 @@ |
| 1 | -#!/usr/bin/env python | |
| 2 | -# | |
| 3 | -# Merge multiple JavaScript source code files into one. | |
| 4 | -# | |
| 5 | -# Usage: | |
| 6 | -# This script requires source files to have dependencies specified in them. | |
| 7 | -# | |
| 8 | -# Dependencies are specified with a comment of the form: | |
| 9 | -# | |
| 10 | -# // @requires <file path> | |
| 11 | -# | |
| 12 | -# e.g. | |
| 13 | -# | |
| 14 | -# // @requires Geo/DataSource.js | |
| 15 | -# | |
| 16 | -# This script should be executed like so: | |
| 17 | -# | |
| 18 | -# mergejs.py <output.js> <directory> [...] | |
| 19 | -# | |
| 20 | -# e.g. | |
| 21 | -# | |
| 22 | -# mergejs.py openlayers.js Geo/ CrossBrowser/ | |
| 23 | -# | |
| 24 | -# This example will cause the script to walk the `Geo` and | |
| 25 | -# `CrossBrowser` directories--and subdirectories thereof--and import | |
| 26 | -# all `*.js` files encountered. The dependency declarations will be extracted | |
| 27 | -# and then the source code from imported files will be output to | |
| 28 | -# a file named `openlayers.js` in an order which fulfils the dependencies | |
| 29 | -# specified. | |
| 30 | -# | |
| 31 | -# | |
| 32 | -# Note: This is a very rough initial version of this code. | |
| 33 | -# | |
| 34 | -# -- Copyright 2005-2008 MetaCarta, Inc. / OpenLayers project -- | |
| 35 | -# | |
| 36 | - | |
| 37 | -# TODO: Allow files to be excluded. e.g. `Crossbrowser/DebugMode.js`? | |
| 38 | -# TODO: Report error when dependency can not be found rather than KeyError. | |
| 39 | - | |
| 40 | -import re | |
| 41 | -import os | |
| 42 | -import sys | |
| 43 | - | |
# Only files with this extension are considered for merging.
SUFFIX_JAVASCRIPT = ".js"

# Matches "@requires <path>" (optionally "@requires: <path>") and captures
# the required path up to the end of the line.
RE_REQUIRE = "@requires:? (.*)\n" # TODO: Ensure in comment?
class SourceFile:
    """
    Represents a Javascript source code file.
    """

    def __init__(self, filepath, source):
        """
        Store the file's path and raw source text.
        """
        self.filepath = filepath
        self.source = source
        # Filled in externally by callers tracking reverse dependencies.
        self.requiredBy = []
        # Lazily-computed cache of the @requires declarations (addresses
        # the original TODO: the property is read repeatedly inside the
        # resolution loop, so re-scanning with the regex each time was
        # wasted work).
        self._requires = None

    def _getRequirements(self):
        """
        Extracts the dependencies specified in the source code and returns
        a list of them.  The result is computed once and cached.
        """
        if self._requires is None:
            self._requires = re.findall(RE_REQUIRE, self.source)
        return self._requires

    requires = property(fget=_getRequirements, doc="")
| 70 | - | |
| 71 | - | |
| 72 | - | |
def usage(filename):
    """
    Displays a usage message for the program named *filename*.
    """
    # Parenthesized so it is valid as both the Python 2 print statement
    # and the Python 3 print function.
    print("%s [-c <config file>] <output.js> <directory> [...]" % filename)
| 78 | - | |
| 79 | - | |
class Config:
    """
    Represents a parsed configuration file.

    A configuration file should be of the following form:

        [first]
        3rd/prototype.js
        core/application.js
        core/params.js
        # A comment

        [last]
        core/api.js # Another comment

        [include]
        core/extra.js

        [exclude]
        3rd/logger.js

    All four headings are required, in the order shown above (the parser
    locates each section by these exact heading strings; the original
    docstring omitted the mandatory `[include]` section).

    The files listed in the `first` section will be forced to load
    *before* all other files (in the order listed).  The files in `last`
    section will be forced to load *after* all the other files (in the
    order listed).

    The files listed in the `exclude` section will not be imported.

    Any text appearing after a # symbol indicates a comment.
    """

    def __init__(self, filename):
        """
        Parses the content of the named file and stores the values.
        """
        # `with` ensures the handle is closed (the original leaked it).
        with open(filename) as configFileObj:
            lines = [re.sub("#.*?$", "", line).strip()  # Assumes end-of-line character is present
                     for line in configFileObj
                     if line.strip() and not line.strip().startswith("#")]  # Skip blank lines and comments

        self.forceFirst = lines[lines.index("[first]") + 1:lines.index("[last]")]
        self.forceLast = lines[lines.index("[last]") + 1:lines.index("[include]")]
        self.include = lines[lines.index("[include]") + 1:lines.index("[exclude]")]
        self.exclude = lines[lines.index("[exclude]") + 1:]
| 124 | - | |
def run (sourceDirectory, outputFilename = None, configFile = None):
    """Merge every *.js file under sourceDirectory into one blob, ordered
    so that each file's @requires dependencies precede it.  Returns the
    merged text; also writes it to outputFilename when given.  Note this
    module is Python 2 (print statements, dict.has_key).
    """
    cfg = None
    if configFile:
        cfg = Config(configFile)

    allFiles = []

    ## Find all the Javascript source files
    for root, dirs, files in os.walk(sourceDirectory):
        for filename in files:
            if filename.endswith(SUFFIX_JAVASCRIPT) and not filename.startswith("."):
                # Path relative to sourceDirectory, normalized to forward
                # slashes so config entries match on Windows too.
                filepath = os.path.join(root, filename)[len(sourceDirectory)+1:]
                filepath = filepath.replace("\\", "/")
                if cfg and cfg.include:
                    if filepath in cfg.include or filepath in cfg.forceFirst:
                        allFiles.append(filepath)
                elif (not cfg) or (filepath not in cfg.exclude):
                    allFiles.append(filepath)

    ## Header inserted at the start of each file in the output
    HEADER = "/* " + "=" * 70 + "\n %s\n" + " " + "=" * 70 + " */\n\n"

    files = {}

    order = [] # List of filepaths to output, in a dependency satisfying order

    ## Import file source code
    ## TODO: Do import when we walk the directories above?
    for filepath in allFiles:
        print "Importing: %s" % filepath
        fullpath = os.path.join(sourceDirectory, filepath).strip()
        content = open(fullpath, "U").read() # TODO: Ensure end of line @ EOF?
        files[filepath] = SourceFile(filepath, content) # TODO: Chop path?

    from toposort import toposort

    complete = False
    resolution_pass = 1

    # Repeat the topological sort until no new files are pulled in and
    # every dependency appears before its dependants: a pass can discover
    # @requires targets that were not in allFiles and import them.
    while not complete:
        order = [] # List of filepaths to output, in a dependency satisfying order
        nodes = []
        routes = []
        ## Resolve the dependencies
        print "Resolution pass %s... " % resolution_pass
        resolution_pass += 1

        for filepath, info in files.items():
            nodes.append(filepath)
            for neededFilePath in info.requires:
                routes.append((neededFilePath, filepath))

        for dependencyLevel in toposort(nodes, routes):
            for filepath in dependencyLevel:
                order.append(filepath)
                # NOTE(review): dict.has_key is Python-2-only.
                if not files.has_key(filepath):
                    print "Importing: %s" % filepath
                    fullpath = os.path.join(sourceDirectory, filepath).strip()
                    content = open(fullpath, "U").read() # TODO: Ensure end of line @ EOF?
                    files[filepath] = SourceFile(filepath, content) # TODO: Chop path?

        # Double check all dependencies have been met: every required file
        # must appear at or before the index of the file requiring it.
        complete = True
        try:
            for fp in order:
                if max([order.index(rfp) for rfp in files[fp].requires] +
                       [order.index(fp)]) != order.index(fp):
                    complete = False
        except:
            # NOTE(review): bare except — a missing dependency (ValueError
            # from order.index) simply triggers another resolution pass.
            complete = False

    ## Move forced first and last files to the required position
    if cfg:
        print "Re-ordering files..."
        order = cfg.forceFirst + [item
                     for item in order
                     if ((item not in cfg.forceFirst) and
                         (item not in cfg.forceLast))] + cfg.forceLast

    ## Output the files in the determined order
    result = []

    for fp in order:
        f = files[fp]
        print "Exporting: ", f.filepath
        result.append(HEADER % f.filepath)
        source = f.source
        result.append(source)
        # Guarantee a trailing newline between concatenated files.
        if not source.endswith("\n"):
            result.append("\n")

    print "\nTotal files merged: %d " % len(files)

    if outputFilename:
        print "\nGenerating: %s" % (outputFilename)
        open(outputFilename, "w").write("".join(result))
    return "".join(result)
| 230 | - | |
if __name__ == "__main__":
    import getopt

    options, args = getopt.getopt(sys.argv[1:], "-c:")

    try:
        # Both arguments are required; catching IndexError covers a
        # missing value for either one (the original crashed with an
        # uncaught IndexError when only the output file was given).
        outputFilename = args[0]
        sourceDirectory = args[1]
    except IndexError:
        usage(sys.argv[0])
        raise SystemExit
    if not sourceDirectory:
        usage(sys.argv[0])
        raise SystemExit

    configFile = None
    if options and options[0][0] == "-c":
        configFile = options[0][1]
        # BUG FIX: the original printed `filename`, a name that is not
        # defined in this scope, raising NameError whenever -c was used.
        print("Parsing configuration file: %s" % configFile)

    run( sourceDirectory, outputFilename, configFile )
pacotes/openlayers/tools/minimize.py
| ... | ... | @@ -1,47 +0,0 @@ |
| 1 | -# Minimal Python Minimizer | |
| 2 | -# Copyright 2008, Christopher Schmidt | |
| 3 | -# Released under the MIT License | |
| 4 | -# | |
| 5 | -# Taken from: http://svn.crschmidt.net/personal/python/minimize.py | |
| 6 | -# $Id: minimize.py 6 2008-01-03 06:33:35Z crschmidt $ | |
| 7 | -# | |
| 8 | -# Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 9 | -# of this software and associated documentation files (the "Software"), to deal | |
| 10 | -# in the Software without restriction, including without limitation the rights | |
| 11 | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 12 | -# copies of the Software, and to permit persons to whom the Software is | |
| 13 | -# furnished to do so, subject to the following conditions: | |
| 14 | -# | |
| 15 | -# The above copyright notice and this permission notice shall be included in | |
| 16 | -# all copies or substantial portions of the Software. | |
| 17 | -# | |
| 18 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 19 | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 20 | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 21 | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 22 | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 23 | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| 24 | -# THE SOFTWARE. | |
| 25 | - | |
| 26 | -import re | |
| 27 | - | |
def strip_comments_helper(data):
    """Delete every /* ... */ comment, together with the whitespace
    immediately surrounding it."""
    comment_pattern = re.compile(r'\s*/\*.*?\*/\s*', re.DOTALL)
    return comment_pattern.sub('', data)
| 32 | - | |
def minimize(data, exclude=None):
    """Central function call. This will call all other compression
    functions. To add further compression algorithms, simply add
    functions whose names end in _helper which take a string as input
    and return a more compressed string as output.

    *exclude* is an optional collection of helper base names (without
    the "_helper" suffix) to skip.
    """
    # dict.items() works on Python 2 and 3 alike; the original used the
    # Python-2-only iteritems().
    for key, item in globals().items():
        if key.endswith("_helper"):
            func_key = key[:-7]
            if not exclude or func_key not in exclude:
                data = item(data)
    return data
| 44 | - | |
if __name__ == "__main__":
    import sys
    # `with` closes the input file (the original leaked the handle), and
    # the parenthesized print is valid on Python 2 and 3 alike.
    with open(sys.argv[1]) as sourceFile:
        print(minimize(sourceFile.read()))
pacotes/openlayers/tools/oldot.py
| ... | ... | @@ -1,43 +0,0 @@ |
| 1 | -import re | |
| 2 | -import os | |
def run():
    """Walk ../lib/OpenLayers and return a list of [classname, parents]
    pairs, where classname is derived from the file path and parents are
    the superclass expressions passed to OpenLayers.Class(...).
    """
    sourceDirectory = "../lib/OpenLayers"
    allFiles = []
    SUFFIX_JAVASCRIPT = ".js"
    ## Find all the Javascript source files
    for root, dirs, files in os.walk(sourceDirectory):
        for filename in files:
            if filename.endswith(SUFFIX_JAVASCRIPT) and not filename.startswith("."):
                filepath = os.path.join(root, filename)[len(sourceDirectory)+1:]
                filepath = filepath.replace("\\", "/")
                # Close the handle explicitly (the original leaked it).
                fileObj = open(os.path.join(sourceDirectory, filepath))
                try:
                    data = fileObj.read()
                finally:
                    fileObj.close()
                parents = re.search(r"OpenLayers.Class\((.*?){", data,
                                    re.DOTALL)
                if parents:
                    parents = [x.strip() for x in parents.group(1).strip().strip(",").split(",")]
                else:
                    parents = []
                # BUG FIX: the original used filepath.strip(".js"), which
                # strips any of the characters '.', 'j', 's' from BOTH
                # ends (mangling e.g. "s.../....js"); slice the suffix
                # off instead.
                cls = "OpenLayers.%s" % filepath[:-len(SUFFIX_JAVASCRIPT)].replace("/", ".")
                allFiles.append([cls, parents])
    return allFiles
# Emit a Graphviz "dot" graph of the OpenLayers class hierarchy on stdout.
# All prints are parenthesized so the script runs under Python 2 and 3.
print("""
digraph name {
  fontname = "Helvetica"
  fontsize = 8
  K = 0.6

  node [
    fontname = "Helvetica"
    fontsize = 8
    shape = "plaintext"
  ]
""")

for i in run():
    print(i[0].replace(".", "_"))
    for item in i[1]:
        if not item: continue
        # One edge per declared parent class.
        print("%s -> %s" % (i[0].replace(".", "_"), item.replace(".", "_")))
    print("; ")

print("""}""")
pacotes/openlayers/tools/release.sh
| ... | ... | @@ -1,29 +0,0 @@ |
#!/bin/sh
#
# Build and publish an OpenLayers release: export the tagged source,
# build the single-file library, generate docs, and stage tarballs.
#
# Usage: release.sh <version>

# Abort on the first failing command instead of ploughing on with a
# half-built release (the original kept going after e.g. a failed export).
set -e

VERSION=$1

if [ -z "$VERSION" ]; then
    echo "Usage: $0 <version>" >&2
    exit 1
fi

# Variables are quoted throughout so an unexpected value cannot split
# into multiple arguments.
svn export "http://svn.openlayers.org/tags/openlayers/release-$VERSION" "OpenLayers-$VERSION"
cd "OpenLayers-$VERSION/build"
./build.py full
cp OpenLayers.js ..

cd ..

mkdir doc/devdocs
mkdir doc/apidocs
rm tools/*.pyc

mkdir "/www/openlayers/htdocs/api/$VERSION"
cp OpenLayers.js "/www/openlayers/htdocs/api/$VERSION"
cp -a img/ "/www/openlayers/htdocs/api/$VERSION"
cp -a theme/ "/www/openlayers/htdocs/api/$VERSION"

cd ..

~/nd/NaturalDocs -i "OpenLayers-$VERSION/lib" -o HTML "OpenLayers-$VERSION/doc/devdocs" -p "OpenLayers-$VERSION/doc_config" -s Small OL
~/nd/NaturalDocs -i "OpenLayers-$VERSION/lib" -o HTML "OpenLayers-$VERSION/doc/apidocs" -p "OpenLayers-$VERSION/apidoc_config" -s Small OL

tar cvfz "OpenLayers-$VERSION.tar.gz" "OpenLayers-$VERSION/"
zip -9r "OpenLayers-$VERSION.zip" "OpenLayers-$VERSION/"

cp OpenLayers-$VERSION.* /www/openlayers/htdocs/download
pacotes/openlayers/tools/shrinksafe.py
| ... | ... | @@ -1,54 +0,0 @@ |
| 1 | -#!/usr/bin/env python | |
| 2 | -# | |
| 3 | -# Script to provide a wrapper around the ShrinkSafe "web service" | |
| 4 | -# <http://shrinksafe.dojotoolkit.org/> | |
| 5 | -# | |
| 6 | - | |
| 7 | -# | |
| 8 | -# We use this script for two reasons: | |
| 9 | -# | |
| 10 | -# * This avoids having to install and configure Java and the standalone | |
| 11 | -# ShrinkSafe utility. | |
| 12 | -# | |
| 13 | -# * The current ShrinkSafe standalone utility was broken when we last | |
| 14 | -# used it. | |
| 15 | -# | |
| 16 | - | |
| 17 | -import sys | |
| 18 | - | |
| 19 | -import urllib | |
| 20 | -import urllib2 | |
| 21 | - | |
# Endpoint of the ShrinkSafe "web service" the form is posted to.
URL_SHRINK_SAFE = "http://shrinksafe.dojotoolkit.org/shrinksafe.php"

# This would normally be dynamically generated:
BOUNDARY_MARKER = "---------------------------72288400411964641492083565382"

if __name__ == "__main__":
    ## Grab the source code
    try:
        sourceFilename = sys.argv[1]
    except IndexError:
        # Narrowed from a bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt.
        print("Usage: %s (<source filename>|-)" % sys.argv[0])
        raise SystemExit

    if sourceFilename == "-":
        sourceCode = sys.stdin.read()
        sourceFilename = "stdin.js"
    else:
        # `with` closes the file (the original leaked the handle).
        with open(sourceFilename) as sourceFile:
            sourceCode = sourceFile.read()

    ## Create the request replicating posting of the form from the web page
    request = urllib2.Request(url=URL_SHRINK_SAFE)
    request.add_header("Content-Type",
                       "multipart/form-data; boundary=%s" % BOUNDARY_MARKER)
    request.add_data("""
--%s
Content-Disposition: form-data; name="shrinkfile[]"; filename="%s"
Content-Type: application/x-javascript

%s
""" % (BOUNDARY_MARKER, sourceFilename, sourceCode))

    ## Deliver the result without appending a newline (the original used
    ## the Python 2 trailing-comma print for the same effect).
    sys.stdout.write(urllib2.urlopen(request).read())
pacotes/openlayers/tools/toposort.py
| ... | ... | @@ -1,260 +0,0 @@ |
| 1 | -# | |
| 2 | -# According to <http://www.vrplumber.com/programming/> this file | |
| 3 | -# is licensed under a BSD-style license. We only use the section | |
| 4 | -# originally by Tim Peters. | |
| 5 | -# | |
| 6 | -# TODO: The use of this code needs to be okayed by someone. | |
| 7 | -# | |
| 8 | - | |
# NOTE(review): this shadows the Python 3 builtin RecursionError; renaming
# it would break callers that import the name, so it is kept as-is.
class RecursionError( OverflowError, ValueError ):
    '''Unable to calculate result because of recursive structure'''
| 11 | - | |
| 12 | - | |
def sort(nodes, routes, noRecursion=1):
    '''Passed a list of node IDs and a list of source,dest ID routes
    attempt to create a list of stages where each sub list
    is one stage in a process.

    Raises RecursionError when a cycle is detected and noRecursion is
    truthy; otherwise the cyclic remainder is appended as a final stage.
    '''
    children, parents = _buildChildrenLists(routes)
    # first stage is those nodes
    # having no incoming routes...
    stage = []
    stages = [stage]
    taken = []
    for node in nodes:
        if (not parents.get(node)):
            stage.append (node)
    if nodes and not stage:
        # there is no element which does not depend on
        # some other element!!!
        stage.append( nodes[0])
    taken.extend( stage )
    # BUG FIX: list comprehension instead of filter().  Under Python 3 the
    # original filter() returned a lazy object, so `while nodes:` looped
    # forever and len(nodes) raised TypeError.
    nodes = [x for x in nodes if x not in stage]
    while nodes:
        previousStageChildren = []
        nodelen = len(nodes)
        # second stage are those nodes
        # which are direct children of the first stage
        for node in stage:
            for child in children.get (node, []):
                if child not in previousStageChildren and child not in taken:
                    previousStageChildren.append(child)
                elif child in taken and noRecursion:
                    # a child that was already emitted implies a cycle
                    raise RecursionError( (child, node) )
        # unless they are children of other direct children...
        # TODO, actually do that...
        stage = previousStageChildren
        removes = []
        for current in stage:
            currentParents = parents.get( current, [] )
            for parent in currentParents:
                if parent in stage and parent != current:
                    # might wind up removing current...
                    if current not in parents.get(parent, []):
                        # is not mutually dependent...
                        removes.append( current )
        for remove in removes:
            while remove in stage:
                stage.remove( remove )
        stages.append( stage)
        taken.extend( stage )
        nodes = [x for x in nodes if x not in stage]
        if nodelen == len(nodes):
            # nothing was consumed this pass: the remainder is cyclic
            if noRecursion:
                raise RecursionError( nodes )
            else:
                stages.append( nodes[:] )
                nodes = []
    return stages
| 69 | - | |
| 70 | -def _buildChildrenLists (routes): | |
| 71 | - childrenTable = {} | |
| 72 | - parentTable = {} | |
| 73 | - for sourceID,destinationID in routes: | |
| 74 | - currentChildren = childrenTable.get( sourceID, []) | |
| 75 | - currentParents = parentTable.get( destinationID, []) | |
| 76 | - if not destinationID in currentChildren: | |
| 77 | - currentChildren.append ( destinationID) | |
| 78 | - if not sourceID in currentParents: | |
| 79 | - currentParents.append ( sourceID) | |
| 80 | - childrenTable[sourceID] = currentChildren | |
| 81 | - parentTable[destinationID] = currentParents | |
| 82 | - return childrenTable, parentTable | |
| 83 | - | |
| 84 | - | |
| 85 | -def toposort (nodes, routes, noRecursion=1): | |
| 86 | - '''Topological sort from Tim Peters, fairly efficient | |
| 87 | - in comparison (it seems).''' | |
| 88 | - #first calculate the recursion depth | |
| 89 | - | |
| 90 | - dependencies = {} | |
| 91 | - inversedependencies = {} | |
| 92 | - if not nodes: | |
| 93 | - return [] | |
| 94 | - if not routes: | |
| 95 | - return [nodes] | |
| 96 | - for node in nodes: | |
| 97 | - dependencies[ node ] = (0, node) | |
| 98 | - inversedependencies[ node ] = [] | |
| 99 | - | |
| 100 | - | |
| 101 | - for depended, depends in routes: | |
| 102 | - # is it a null rule | |
| 103 | - try: | |
| 104 | - newdependencylevel, object = dependencies.get ( depends, (0, depends)) | |
| 105 | - except TypeError: | |
| 106 | - print depends | |
| 107 | - raise | |
| 108 | - dependencies[ depends ] = (newdependencylevel + 1, depends) | |
| 109 | - # "dependency (existence) of depended-on" | |
| 110 | - newdependencylevel,object = dependencies.get ( depended, (0, depended) ) | |
| 111 | - dependencies[ depended ] = (newdependencylevel, depended) | |
| 112 | - # Inverse dependency set up | |
| 113 | - dependencieslist = inversedependencies.get ( depended, []) | |
| 114 | - dependencieslist.append (depends) | |
| 115 | - inversedependencies[depended] = dependencieslist | |
| 116 | - ### Now we do the actual sorting | |
| 117 | - # The first task is to create the sortable | |
| 118 | - # list of dependency-levels | |
| 119 | - sortinglist = dependencies.values() | |
| 120 | - sortinglist.sort () | |
| 121 | - output = [] | |
| 122 | - while sortinglist: | |
| 123 | - deletelist = [] | |
| 124 | - generation = [] | |
| 125 | - output.append( generation) | |
| 126 | - while sortinglist and sortinglist[0][0] == 0: | |
| 127 | - number, object = sortinglist[0] | |
| 128 | - generation.append ( object ) | |
| 129 | - deletelist.append( object ) | |
| 130 | - for inverse in inversedependencies.get(object, () ): | |
| 131 | - try: | |
| 132 | - oldcount, inverse = dependencies [ inverse] | |
| 133 | - if oldcount > 0: | |
| 134 | - # will be dealt with on later pass | |
| 135 | - dependencies [ inverse] = (oldcount-1, inverse) | |
| 136 | - else: | |
| 137 | - # will be dealt with on this pass, | |
| 138 | - # so needs not to be in the sorting list next time | |
| 139 | - deletelist.append( inverse ) | |
| 140 | - # just in case a loop comes through | |
| 141 | - inversedependencies[object] = [] | |
| 142 | - except KeyError: | |
| 143 | - # dealing with a recursion-breaking run... | |
| 144 | - pass | |
| 145 | - del sortinglist [0] | |
| 146 | - # if no elements could be deleted, then | |
| 147 | - # there is something which depends upon itself | |
| 148 | - if not deletelist: | |
| 149 | - if noRecursion: | |
| 150 | - raise RecursionError( sortinglist ) | |
| 151 | - else: | |
| 152 | - # hack so that something gets deleted... | |
| 153 | -## import pdb | |
| 154 | -## pdb.set_trace() | |
| 155 | - dependencies[sortinglist[0][1]] = (0,sortinglist[0][1]) | |
| 156 | - # delete the items that were dealt with | |
| 157 | - for item in deletelist: | |
| 158 | - try: | |
| 159 | - del dependencies [ item ] | |
| 160 | - except KeyError: | |
| 161 | - pass | |
| 162 | - # need to recreate the sortinglist | |
| 163 | - sortinglist = dependencies.values() | |
| 164 | - if not generation: | |
| 165 | - output.remove( generation ) | |
| 166 | - sortinglist.sort () | |
| 167 | - return output | |
| 168 | - | |
| 169 | - | |
| 170 | - | |
| 171 | - | |
| 172 | - | |
| 173 | -if __name__ == "__main__": | |
| 174 | - | |
| 175 | - nodes = ['a', 'b', 'c', 'd', 'e', 'f'] | |
| 176 | - route = [('a', 'b'), ('b', 'c'), ('b', 'd'), ('e','f')] | |
| 177 | - | |
| 178 | - for x in toposort( nodes, route): | |
| 179 | - for a in x: | |
| 180 | - print a | |
| 181 | - | |
| 182 | - raise SystemExit | |
| 183 | - | |
| 184 | - | |
| 185 | - | |
| 186 | - import pprint, traceback | |
| 187 | - nodes= [ 0,1,2,3,4,5 ] | |
| 188 | - testingValues = [ | |
| 189 | - [ (0,1),(1,2),(2,3),(3,4),(4,5)], | |
| 190 | - [ (0,1),(0,2),(1,2),(3,4),(4,5)], | |
| 191 | - [ | |
| 192 | - (0,1), | |
| 193 | - (0,2), | |
| 194 | - (0,2), | |
| 195 | - (2,4), | |
| 196 | - (2,5), | |
| 197 | - (3,2), | |
| 198 | - (0,3)], | |
| 199 | - [ | |
| 200 | - (0,1), # 3-element cycle test, no orphan nodes | |
| 201 | - (1,2), | |
| 202 | - (2,0), | |
| 203 | - (2,4), | |
| 204 | - (2,5), | |
| 205 | - (3,2), | |
| 206 | - (0,3)], | |
| 207 | - [ | |
| 208 | - (0,1), | |
| 209 | - (1,1), | |
| 210 | - (1,1), | |
| 211 | - (1,4), | |
| 212 | - (1,5), | |
| 213 | - (1,2), | |
| 214 | - (3,1), | |
| 215 | - (2,1), | |
| 216 | - (2,0)], | |
| 217 | - [ | |
| 218 | - (0,1), | |
| 219 | - (1,0), | |
| 220 | - (0,2), | |
| 221 | - (0,3), | |
| 222 | - ], | |
| 223 | - [ | |
| 224 | - (0,1), | |
| 225 | - (1,0), | |
| 226 | - (0,2), | |
| 227 | - (3,1), | |
| 228 | - ], | |
| 229 | - ] | |
| 230 | - print 'sort, no recursion allowed' | |
| 231 | - for index in range(len(testingValues)): | |
| 232 | -## print ' %s -- %s'%( index, testingValues[index]) | |
| 233 | - try: | |
| 234 | - print ' ', sort( nodes, testingValues[index] ) | |
| 235 | - except: | |
| 236 | - print 'exception raised' | |
| 237 | - print 'toposort, no recursion allowed' | |
| 238 | - for index in range(len(testingValues)): | |
| 239 | -## print ' %s -- %s'%( index, testingValues[index]) | |
| 240 | - try: | |
| 241 | - print ' ', toposort( nodes, testingValues[index] ) | |
| 242 | - except: | |
| 243 | - print 'exception raised' | |
| 244 | - print 'sort, recursion allowed' | |
| 245 | - for index in range(len(testingValues)): | |
| 246 | -## print ' %s -- %s'%( index, testingValues[index]) | |
| 247 | - try: | |
| 248 | - print ' ', sort( nodes, testingValues[index],0 ) | |
| 249 | - except: | |
| 250 | - print 'exception raised' | |
| 251 | - print 'toposort, recursion allowed' | |
| 252 | - for index in range(len(testingValues)): | |
| 253 | -## print ' %s -- %s'%( index, testingValues[index]) | |
| 254 | - try: | |
| 255 | - print ' ', toposort( nodes, testingValues[index],0 ) | |
| 256 | - except: | |
| 257 | - print 'exception raised' | |
| 258 | - | |
| 259 | - | |
| 260 | - |
pacotes/openlayers/tools/update_dev_dir.sh
| ... | ... | @@ -1,45 +0,0 @@ |
| 1 | -#!/bin/sh | |
| 2 | - | |
| 3 | -# Used to update http://openlayers.org/dev/ | |
| 4 | - | |
| 5 | -svn up /www/openlayers/docs/dev; | |
| 6 | - | |
| 7 | -# Get current 'Last Changed Rev' | |
| 8 | -REV=`svn info /www/openlayers/docs/dev/ | grep 'Last Changed Rev' | awk '{print $4}'` | |
| 9 | - | |
| 10 | -# Get the last svn rev | |
| 11 | -touch /tmp/ol_svn_rev | |
| 12 | -OLD_REV="o`cat /tmp/ol_svn_rev`" | |
| 13 | - | |
| 14 | -# If they're not equal, do some work. | |
| 15 | -if [ ! o$REV = $OLD_REV ]; then | |
| 16 | - | |
| 17 | - cd /www/openlayers/docs/dev/tools/ | |
| 18 | - python exampleparser.py | |
| 19 | - cd /www/openlayers/docs/dev/build | |
| 20 | - ./build.py | |
| 21 | - | |
| 22 | - cp OpenLayers.js .. | |
| 23 | - cd .. | |
| 24 | - | |
| 25 | - sed -i -e 's!../lib/OpenLayers.js!../OpenLayers.js!' examples/*.html | |
| 26 | - perl /home/crschmidt/NaturalDocs -i /www/openlayers/docs/dev/lib -o HTML /www/openlayers/dev/apidocs -p /www/openlayers/docs/dev/apidoc_config -s Default OL >/dev/null | |
| 27 | - perl /home/crschmidt/NaturalDocs -i /www/openlayers/docs/dev/lib -o HTML /www/openlayers/dev/docs -p /www/openlayers/docs/dev/doc_config -s Default OL >/dev/null | |
| 28 | - | |
| 29 | - # Record the revision | |
| 30 | - echo -n $REV > /tmp/ol_svn_rev | |
| 31 | -fi | |
| 32 | - | |
| 33 | -svn up /www/openlayers/documentation-checkout | |
| 34 | -REV=`svn info /www/openlayers/documentation-checkout | grep 'Last Changed Rev' | awk '{print $4}'` | |
| 35 | -# Get the last svn rev | |
| 36 | -touch /tmp/ol_doc_rev | |
| 37 | -OLD_REV="o`cat /tmp/ol_doc_rev`" | |
| 38 | -# If they're not equal, do some work. | |
| 39 | -if [ ! o$REV = $OLD_REV ]; then | |
| 40 | - cd /www/openlayers/documentation-checkout | |
| 41 | - make html > /dev/null | |
| 42 | - cp -r _build/html/* /www/openlayers/documentation | |
| 43 | - | |
| 44 | - echo -n $REV > /tmp/ol_doc_rev | |
| 45 | -fi |