Package lxml :: Package html
[frames] | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  import urlparse 
   7  import copy 
   8  from lxml import etree 
   9  from lxml.html import defs 
  10  from lxml import cssselect 
  11  from lxml.html._setmixin import SetMixin 
  12  try: 
  13      from UserDict import DictMixin 
  14  except ImportError: 
  15      # DictMixin was introduced in Python 2.4 
  16      from lxml.html._dictmixin import DictMixin 
  17  import sets 
  18   
  19  __all__ = [ 
  20      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  21      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  22      'find_rel_links', 'find_class', 'make_links_absolute', 
  23      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  24   
  25  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  26  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  27  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  28  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  29  _collect_string_content = etree.XPath("string()") 
  30  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  31  _css_import_re = re.compile(r'@import "(.*?)"') 
  32  _label_xpath = etree.XPath("//label[@for=$id]") 
  33  _archive_re = re.compile(r'[^ ]+') 
  34   
35 -class HtmlMixin(object):
36
def base_url(self):
    """
    Returns the base URL, given when the page was parsed.

    The value is read from ``docinfo.URL`` of the tree this element
    belongs to.  Use with ``urlparse.urljoin(el.base_url, href)`` to
    get absolute URLs.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    Return a list of all the forms
    """
    # getiterator() yields this element and its descendants with the
    # given tag; the result is materialized into a new list.
    return [form for form in self.getiterator('form')]
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    The lookup is the document-wide XPath ``//body``; IndexError is
    raised when the document has no <body> element.
    """
    return self.xpath('//body')[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    Returns the <head> element.  Can be called from a child
    element to get the document's head.
    """
    # Document-wide lookup; indexing raises IndexError if nothing matches.
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def _label__get(self):
    """
    Get or set any <label> element associated with this element.
    """
    own_id = self.get('id')
    if own_id:
        # <label for="..."> elements are searched in the whole document.
        matches = _label_xpath(self, id=own_id)
        if matches:
            return matches[0]
    return None
def _label__set(self, label):
    own_id = self.get('id')
    if not own_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if label.tag != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    # The association is stored on the label, not on this element.
    label.set('for', own_id)
def _label__del(self):
    current = self.label
    if current is None:
        return
    del current.attrib['for']
label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
def drop_tree(self):
    """
    Removes this element from the tree, including its children and
    text.  The tail text is joined to the previous element or
    parent.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        # Keep the text that followed the element: it becomes part of
        # the previous sibling's tail or, for a first child, of the
        # parent's text.
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
114
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()
    text = self.text
    # Only real elements contribute their text (not a Comment, etc.).
    if text and isinstance(self.tag, basestring):
        if previous is not None:
            previous.tail = (previous.tail or '') + text
        else:
            parent.text = (parent.text or '') + text
    tail = self.tail
    if tail:
        if len(self):
            # The tail follows the last child once the children are
            # moved up into the parent.
            last_child = self[-1]
            last_child.tail = (last_child.tail or '') + tail
        elif previous is not None:
            previous.tail = (previous.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    # Replace this element, in place, by its children.
    position = parent.index(self)
    parent[position:position + 1] = self[:]
146 154
def find_class(self, class_name):
    """
    Find any elements with the given class name.

    The search covers this element and its descendants and matches
    ``class_name`` as one whitespace-separated token of the ``class``
    attribute.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
160
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        # No match at all: fall back to the optional default.
        if default:
            return default[0]
        else:
            # Call syntax works on Python 2 and 3 alike, unlike the
            # former ``raise KeyError, id`` statement form.
            raise KeyError(id)
181
def text_content(self):
    """
    Return the text content of the tag (and the text in any children).
    """
    # Evaluates the pre-compiled XPath ``string()`` on this element.
    content = _collect_string_content(self)
    return content
187
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelect(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    selector = cssselect.CSSSelect(expr)
    return selector(self)
198 199 ######################################## 200 ## Link functions 201 ######################################## 202 222 self.rewrite_links(link_repl)
223
def resolve_base_href(self):
    """
    Find any ``<base href>`` tag in the document, and apply its
    values to all links found in the document.  Also remove the
    tag once it has been applied.
    """
    href = None
    # Every <base href> tag is dropped; the href of the last one wins.
    for base_tag in self.xpath('//base[@href]'):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        # The tags are already gone, so do not resolve them again.
        self.make_links_absolute(href, resolve_base_href=False)
238 293 340 341
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method to look up on the document, ``copy`` the
        default for copying element inputs before calling it, and
        ``source_class`` the class whose method docstring is reused.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring`` first and the
        ``copy`` keyword is rejected with TypeError.  An element ``doc``
        is deep-copied first when ``copy`` (or the instance default) is
        true.  If the method returns None, the document is returned
        instead: serialized for string input, as an element otherwise.
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be named ``copy``: that would shadow the
            # ``copy`` module inside this function and turn the
            # deepcopy() call below into an attribute lookup on a bool.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result

# Function forms of the HtmlMixin link/class helpers.  Those with
# copy=True work on a deep copy of an element input by default.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that adds the ``HtmlMixin`` helpers."""
390
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that adds the ``HtmlMixin`` helpers."""
393
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing-instruction node class that adds the ``HtmlMixin`` helpers."""
396
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that adds the ``HtmlMixin`` helpers."""
399 400
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class; used when no ``classes`` are passed.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, so go
            # through items() for mappings; pair sequences pass as-is.
            if hasattr(mixins, 'items'):
                mixin_pairs = mixins.items()
            else:
                mixin_pairs = mixins
            mixers = {}
            for name, value in mixin_pairs:
                if name == '*':
                    # Applies to every tag that has a registered class.
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence over the
                # current class.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type: the
        registered class for the lower-cased tag name (default
        ``HtmlElement``) for elements, the Html* classes for comments,
        processing instructions and entities, and None otherwise.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
441 442 ################################################################################ 443 # parsing 444 ################################################################################ 445
def document_fromstring(html, **kw):
    """
    Parse ``html`` as a complete document with ``etree.HTML`` and the
    module's ``html_parser``, returning the root element.

    Extra keyword arguments are passed on to ``etree.HTML``.  Raises
    ``etree.ParserError`` when parsing yields no element.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
452
def fragments_fromstring(html, no_leading_text=False, base_url=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_like_document = (prefix.startswith('<html')
                           or prefix.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap bare fragments so that the parser produces a <body>.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, base_url=base_url, **kw)
    assert doc.tag == 'html'
    bodies = [e for e in doc if e.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
483
def fragment_fromstring(html, create_parent=False, base_url=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if create_parent:
        # Any true non-string value selects the default <div> wrapper.
        if isinstance(create_parent, basestring):
            parent_tag = create_parent
        else:
            parent_tag = 'div'
        wrapped = '<%s>%s</%s>' % (parent_tag, html, parent_tag)
        return fragment_fromstring(wrapped, base_url=base_url, **kw)
    elements = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
    count = len(elements)
    if count == 0:
        raise etree.ParserError(
            "No elements found")
    if count > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    el = elements[0]
    trailing = el.tail
    if trailing and trailing.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % trailing)
    # Whitespace-only tail text is discarded.
    el.tail = None
    return el
514
def fromstring(html, base_url=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The result is the whole document when the input looks like one or
    when the parsed tree has a <head> (or neither <head> nor <body>);
    the single element when the body holds exactly one element and no
    other text; otherwise the body itself, retagged as ``div`` or
    ``span``.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body> was produced; there is nothing to
        # unwrap, so hand back the document instead of failing on
        # len(None) below.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
571
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.

    The module's ``html_parser`` is used when no ``parser`` is given.
    """
    active_parser = parser
    if active_parser is None:
        active_parser = html_parser
    return etree.parse(filename_or_url, active_parser, base_url=base_url, **kw)
584
def _contains_block_level_tag(el):
    """
    Return True if ``el`` or anything below it has a tag listed in
    ``defs.block_tags``, False otherwise.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.getiterator():
        if node.tag in block_tags:
            return True
    return False
592
def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages:
    'comment' for comments, 'string' for text, else the tag.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
600 601 ################################################################################ 602 # form handling 603 ################################################################################ 604
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        fields = self.fields
        # Names that the new mapping does not mention are reset below.
        unseen = fields.keys()
        for key, item in value.iteritems():
            if key in unseen:
                unseen.remove(key)
            fields[key] = item
        for key in unseen:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if key is not None:
                fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """
        Name used in reprs: the ``name`` attribute, else ``#`` plus the
        ``id``, else the form's index among the body's <form> children.
        """
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        return str(self.body.findall('form').index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                # Unnamed inputs contribute nothing.
                continue
            tag = el.tag
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    # One pair per value of the multi-select.
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            # Resolve the action against the document's base URL.
            return urlparse.urljoin(base_url, action)
        return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        method = self.get('method', 'GET')
        return method.upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Make the default lookup use FormElement for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response)
        doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    tuples) is appended to the values collected from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        values.extend(extras)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, values)
740
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form``, built on ``urllib``.

    For 'GET' the url-encoded values are appended to the URL's query
    string; for any other method they are passed as the request data.
    Returns the result of ``urllib.urlopen()``.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    encoded = urllib.urlencode(values)
    if method == 'GET':
        # Extend an existing query string rather than starting a new one.
        if '?' in url:
            separator = '&'
        else:
            separator = '?'
        url = url + separator + encoded
        data = None
    else:
        data = encoded
    return urllib.urlopen(url, data)
754
class FieldsDict(DictMixin):
    """
    Dictionary-like view over the values of a form's inputs, keyed by
    input name.  Values can be read and assigned; keys cannot be
    removed.
    """

    def __init__(self, inputs):
        # ``inputs`` is the accessor that maps names to input elements.
        self.inputs = inputs

    def __getitem__(self, name):
        field = self.inputs[name]
        return field.value

    def __setitem__(self, name, value):
        field = self.inputs[name]
        field.value = value

    def __delitem__(self, name):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, name):
        return name in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
775
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    # select/input/textarea elements below the form with a given name.
    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    # All select/input/textarea elements below the form.
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        first = results[0]
        if len(results) > 1:
            # Several radios/checkboxes sharing a name form a group; the
            # type of the first match decides which kind.
            input_type = first.get('type')
            if input_type == 'radio':
                group = RadioGroup(results)
            elif input_type == 'checkbox':
                group = CheckboxGroup(results)
            else:
                # I don't like throwing away elements like this
                return first
            group.name = name
            return group
        return first

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        # Distinct names of all inputs that have a name attribute.
        names = sets.Set(
            [el.name for el in self if el.name is not None])
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        elements = self._all_xpath(self.form)
        return iter(elements)
840
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')
    def _name__set(self, value):
        self.set('name', value)
    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # Not every class using this mix-in defines ``type``.
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
868
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''
    def _value__set(self, value):
        self.text = value
    def _value__del(self):
        # Deleting the value empties the element's text.
        self.text = ''
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

# Make the default lookup use TextareaElement for <textarea> tags.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        chosen = None
        if value is not None:
            # Locate the target option first, so that an unknown value
            # leaves the current selection untouched.
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in self.getiterator('option'):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        attributes = self.attrib
        if 'multiple' in attributes:
            del attributes['multiple']
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

# Make the default lookup use SelectElement for <select> tags.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        # The <select> element whose options this object reflects.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        """
        Iterate over the values of the currently selected options.
        """
        for option in self.options:
            # Only selected options are members of this set; yielding
            # every option would report unselected values as selected.
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raise
        ValueError if there is no such option.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raise
        ValueError if it is not selected or does not exist.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is not option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1020
class RadioGroup(list):
    """
    This object represents several ``<input type=radio>`` elements
    that have the same name.

    You can use this like a list, but also use the property
    ``.value`` to check/uncheck inputs.  Also you can use
    ``.value_options`` to get the possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        checked = [el for el in self if 'checked' in el.attrib]
        if checked:
            # The first checked radio decides the value.
            return checked[0].get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Find the radio to check before touching anything, so an
            # unknown value leaves the group unchanged.
            for el in self:
                if el.get('value') == value:
                    target = el
                    break
            if target is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for el in self:
            if 'checked' in el.attrib:
                del el.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for el in self:
            options.append(el.get('value'))
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))
1072
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)
    def _value__set(self, value):
        # Validate before clearing: a rejected assignment must not wipe
        # the current check state as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        values = self.value
        values.clear()
        values.update(value)
    def _value__del(self):
        # Deleting the value unchecks every checkbox.
        self.value.clear()
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1104
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Collect the values up front so that the iteration works on a
        # snapshot of the check state.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value.
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it is an error
        # if that checkbox is not checked.
        for el in self.group:
            if el.get('value') != value:
                continue
            if 'checked' not in el.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del el.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown,
            self.group.name)
1147
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the lower-cased input type (``'text'`` when the
    attribute is absent) and ``.value`` reads or writes the value.

    Checkboxes and radios report ``input.checkable == True`` (every
    other type reports false) and additionally offer the boolean
    ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        A checked checkbox or radio without a value yields ``'on'``;
        an unchecked checkbox or radio yields None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only string values are stored; any other true value just
            # checks the element.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        raw_type = self.get('type', 'text')
        return raw_type.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Only usable on checkable input types; any other type raises
        AttributeError.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

# Map the ``input`` tag name to InputElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.

        Returns None when the label has no (or an empty) ``for``
        attribute.  Otherwise the lookup is delegated to
        ``self.body.get_element_by_id``.
        NOTE(review): confirm how that helper reports an id that does
        not exist in the document.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        """
        Point this label at ``other`` by copying its ``id`` into ``for``.

        Raises TypeError when ``other`` has no ``id`` attribute.
        """
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        """Unlink the label by removing its ``for`` attribute, if any."""
        # The link lives in ``for`` (see getter and setter).  Removing
        # ``id`` here stripped the label's own id and left the link intact.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

# Map the ``label`` tag name to LabelElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# (bound to the compiled pattern's ``sub`` method, so it is called as
# ``__replace_meta_content_type(replacement, text)``)
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Serialisation is done by ``etree.tostring``.  Unless
    ``include_meta_content_type`` is true, every
    ``<meta http-equiv="Content-Type" ...>`` tag found in the serialised
    output is stripped from the result; when it is true the output is
    returned untouched.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialized = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1317
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` may be anything with a ``write`` method; any other object
    is wrapped in an ``etree.ElementTree`` first.  The temporary file
    is not removed afterwards, and its ``file://`` URL is printed
    before the browser is launched.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates the file itself, so the name cannot be taken by
    # someone else between choosing it and writing to it, which was
    # possible with the deprecated os.tempnam().
    handle, fn = tempfile.mkstemp(suffix='.html')
    # Only the reserved path is needed; the document is written by name.
    os.close(handle)
    write_doc(fn, method="html")
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1334 1335 ################################################################################ 1336 # configure Element class lookup 1337 ################################################################################ 1338
class HTMLParser(etree.HTMLParser):
    """
    An ``etree.HTMLParser`` that installs an ``HtmlElementClassLookup``
    as its element class lookup on construction.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are passed through to the base parser unchanged.
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.setElementClassLookup(lookup)
1343
def Element(*args, **kw):
    """
    Create a new element through the module-level ``html_parser``.

    All positional and keyword arguments are forwarded unchanged to
    ``html_parser.makeelement``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instance; Element() creates its elements through
# this parser's makeelement().
html_parser = HTMLParser()
