Pruebas HTMLparser

428 days ago by mabanades

%hide
# Pass a <script> element to html(). The script declares the JavaScript
# global `texto` and a function show(); show() sets `texto` to "algo" and
# writes it into the element whose id is "Area".
html('<script type="text/javascript">var texto = "";function show(){texto = "algo"; document.getElementById("Area").innerHTML = texto;}</script>')
       

Pruebas HTML parser.

March 2011

Press the Show algo button to obtain "algo" in the text area.


Click evaluate. If you do not see evaluate, click %hide and press shift+enter. Once the text field is generated, paste into it the XML description copied from the text area above.

%hide automatic_names(True) @interact def read_xml(XML_from_construction='Replace this text with the XML desciption in the text area above'): global texto texto=XML_from_construction 
       

Click to the left again to hide and once more to show the dynamic interactive window

# Pass a <script> element to html(). The script declares the JavaScript
# global `texto` and a function show(); show() sets `texto` to "algo" and
# writes it into the element whose id is "Area".
html('<script type="text/javascript">var texto = "";function show(){texto = "algo"; document.getElementById("Area").innerHTML = texto;}</script>') 
       
texto 
       
'Replace this text with the XML desciption in the text area above'
'Replace this text with the XML desciption in the text area above'

Click evaluate. If you do not see evaluate, click %hide and press shift+enter. This cell carries out the main computation. Scroll down (if needed), take a look at the code, and go to the next cell.

import urllib

# Download the page at gie.cesfelipesegundo.com and keep its raw contents in
# the global 's'. urlopen() returns a file-like object; the try/finally makes
# sure it is closed even when read() raises.
f = urllib.urlopen("http://gie.cesfelipesegundo.com")
try:
    s = f.read()
finally:
    f.close()
# print(s)  # uncomment to inspect the downloaded page
       
import sgmllib
import urllib


class MyParser(sgmllib.SGMLParser):
    """A simple SGML parser that collects the href of every <a> start tag."""

    def __init__(self, verbose=0):
        """Initialise the parser, passing 'verbose' to the superclass."""
        sgmllib.SGMLParser.__init__(self, verbose)
        # href values in the order they were encountered.
        self.hyperlinks = []

    def parse(self, s):
        """Parse the given string 's' in one go: feed it, then close."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Record the 'href' value(s) of a hyperlink start tag.

        'attributes' is iterated as (name, value) pairs; sgmllib invokes this
        handler for <a> start tags.
        """
        self.hyperlinks.extend(
            value for name, value in attributes if name == "href"
        )

    def get_hyperlinks(self):
        """Return the list of hyperlinks collected so far (not a copy)."""
        return self.hyperlinks


# Get something to work with. The handle is closed even if read() fails.
f = urllib.urlopen('http://alpha.sagenb.org/home/pub/93')
try:
    s = f.read()
finally:
    f.close()

# Process the page; MyParser must already be defined at this point.
myparser = MyParser()
myparser.parse(s)

# Show the hyperlinks (single-argument print: same output as `print x`).
# print(s)
print(myparser.get_hyperlinks())
       
['/', 'download/Pruebas HTMLparser.sws', '/pub/']
['/', 'download/Pruebas HTMLparser.sws', '/pub/']
dir(s) 
       
['__add__', '__class__', '__contains__', '__delattr__', '__doc__',
'__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__',
'__getnewargs__', '__getslice__', '__gt__', '__hash__', '__init__',
'__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__',
'__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__',
'__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__',
'_formatter_field_name_split', '_formatter_parser', 'capitalize',
'center', 'count', 'decode', 'encode', 'endswith', 'expandtabs', 'find',
'format', 'index', 'isalnum', 'isalpha', 'isdigit', 'islower',
'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip',
'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition',
'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip',
'swapcase', 'title', 'translate', 'upper', 'zfill']
['__add__', '__class__', '__contains__', '__delattr__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getslice__', '__gt__', '__hash__', '__init__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_formatter_field_name_split', '_formatter_parser', 'capitalize', 'center', 'count', 'decode', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'index', 'isalnum', 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']
# Membership test on the downloaded page text.
'Nada' in s
       
True
True
s?? 
       
Source code for 

    
        
        Pruebas HTMLparser -- Sage
        
        
        
        
        
        
        







































        
        
        
        
    
    
        


Pruebas HTMLparser

1 minute ago by mabanades

%hide
# Pass a <script> element to html(). The script declares the JavaScript
# global `texto` and a function show(); show() sets `texto` to "algo" and
# writes it into the element whose id is "Area".
html('<script type="text/javascript">var texto = "";function show(){texto = "algo"; document.getElementById("Area").innerHTML = texto;}</script>')
       

Pruebas HTML parser.

March 2011

Press the Show algo button to obtain "algo" in the text area.


Click evaluate. If you do not see evaluate, click %hide and press shift+enter. Once the text field is generated, paste into it the XML description copied from the text area above.

%hide automatic_names(True) @interact def read_xml(XML_from_construction='Replace this text with the XML desciption in the text area above'): global texto texto=XML_from_construction 
       
# Pass a <script> element to html(). The script declares the JavaScript
# global `texto` and a function show(); show() sets `texto` to "algo" and
# writes it into the element whose id is "Area".
html('<script type="text/javascript">var texto = "";function show(){texto = "algo"; document.getElementById("Area").innerHTML = texto;}</script>') 
       
texto 
       
'Replace this text with the XML desciption in the text area above'

Click evaluate. If you do not see evaluate, click %hide and press shift+enter. This cell carries out the main computation. Scroll down (if needed), take a look at the code, and go to the next cell.

import urllib

# Download the page at gie.cesfelipesegundo.com and keep its raw contents in
# the global 's'. urlopen() returns a file-like object; the try/finally makes
# sure it is closed even when read() raises.
f = urllib.urlopen("http://gie.cesfelipesegundo.com")
try:
    s = f.read()
finally:
    f.close()
# print(s)  # uncomment to inspect the downloaded page
       
import sgmllib
import urllib


class MyParser(sgmllib.SGMLParser):
    """A simple SGML parser that collects the href of every <a> start tag."""

    def __init__(self, verbose=0):
        """Initialise the parser, passing 'verbose' to the superclass."""
        sgmllib.SGMLParser.__init__(self, verbose)
        # href values in the order they were encountered.
        self.hyperlinks = []

    def parse(self, s):
        """Parse the given string 's' in one go: feed it, then close."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Record the 'href' value(s) of a hyperlink start tag.

        'attributes' is iterated as (name, value) pairs; sgmllib invokes this
        handler for <a> start tags.
        """
        self.hyperlinks.extend(
            value for name, value in attributes if name == "href"
        )

    def get_hyperlinks(self):
        """Return the list of hyperlinks collected so far (not a copy)."""
        return self.hyperlinks


# Get something to work with. The handle is closed even if read() fails.
f = urllib.urlopen('http://elpais.com')
try:
    s = f.read()
finally:
    f.close()

# Process the page; MyParser must already be defined at this point.
myparser = MyParser()
myparser.parse(s)

# Output is disabled in this cell; uncomment to inspect the results.
# print(s)
# print(myparser.get_hyperlinks())
       
dir(s) 
       
['__add__', '__class__', '__contains__', '__delattr__', '__doc__',
'__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__',
'__getnewargs__', '__getslice__', '__gt__', '__hash__', '__init__',
'__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__',
'__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__',
'__rmul__', '__setattr__', '__sizeof__', '__str__',
'__subclasshook__', '_formatter_field_name_split',
'_formatter_parser', 'capitalize', 'center', 'count', 'decode',
'encode', 'endswith', 'expandtabs', 'find', 'format', 'index',
'isalnum', 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle',
'isupper', 'join', 'ljust', 'lower', 'lstrip', 'partition',
'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit',
'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase',
'title', 'translate', 'upper', 'zfill']
import HTMLParser h = HTMLParser.HTMLParser() a = '<textarea>document.getElementById("Area")</textarea>' h.feed(a) print h.get_starttag_text() h.close() 
       
<textarea>
dir(h) 
       
['CDATA_CONTENT_ELEMENTS', '_HTMLParser__starttag_text', '__doc__',
'__init__', '__module__', '_decl_otherchars',
'_parse_doctype_attlist', '_parse_doctype_element',
'_parse_doctype_entity', '_parse_doctype_notation',
'_parse_doctype_subset', '_scan_name', 'check_for_whole_start_tag',
'clear_cdata_mode', 'close', 'entitydefs', 'error', 'feed',
'get_starttag_text', 'getpos', 'goahead', 'handle_charref',
'handle_comment', 'handle_data', 'handle_decl', 'handle_endtag',
'handle_entityref', 'handle_pi', 'handle_startendtag',
'handle_starttag', 'interesting', 'lasttag', 'lineno', 'offset',
'parse_comment', 'parse_declaration', 'parse_endtag',
'parse_marked_section', 'parse_pi', 'parse_starttag', 'rawdata',
'reset', 'set_cdata_mode', 'unescape', 'unknown_decl', 'updatepos']
 
       

                    not available.</pre></div> </html>
Source code for 

    
        
        Pruebas HTMLparser -- Sage
        
        
        
        
        
        
        







































        
        
        
        
    
    
        


Pruebas HTMLparser

1 minute ago by mabanades

%hide
# Pass a <script> element to html(). The script declares the JavaScript
# global `texto` and a function show(); show() sets `texto` to "algo" and
# writes it into the element whose id is "Area".
html('<script type="text/javascript">var texto = "";function show(){texto = "algo"; document.getElementById("Area").innerHTML = texto;}</script>')
       

Pruebas HTML parser.

March 2011

Press the Show algo button to obtain "algo" in the text area.


Click evaluate. If you do not see evaluate, click %hide and press shift+enter. Once the text field is generated, paste into it the XML description copied from the text area above.

%hide automatic_names(True) @interact def read_xml(XML_from_construction='Replace this text with the XML desciption in the text area above'): global texto texto=XML_from_construction 
       
# Pass a <script> element to html(). The script declares the JavaScript
# global `texto` and a function show(); show() sets `texto` to "algo" and
# writes it into the element whose id is "Area".
html('<script type="text/javascript">var texto = "";function show(){texto = "algo"; document.getElementById("Area").innerHTML = texto;}</script>') 
       
texto 
       
'Replace this text with the XML desciption in the text area above'

Click evaluate. If you do not see evaluate, click %hide and press shift+enter. This cell carries out the main computation. Scroll down (if needed), take a look at the code, and go to the next cell.

import urllib

# Download the page at gie.cesfelipesegundo.com and keep its raw contents in
# the global 's'. urlopen() returns a file-like object; the try/finally makes
# sure it is closed even when read() raises.
f = urllib.urlopen("http://gie.cesfelipesegundo.com")
try:
    s = f.read()
finally:
    f.close()
# print(s)  # uncomment to inspect the downloaded page
       
import sgmllib
import urllib


class MyParser(sgmllib.SGMLParser):
    """A simple SGML parser that collects the href of every <a> start tag."""

    def __init__(self, verbose=0):
        """Initialise the parser, passing 'verbose' to the superclass."""
        sgmllib.SGMLParser.__init__(self, verbose)
        # href values in the order they were encountered.
        self.hyperlinks = []

    def parse(self, s):
        """Parse the given string 's' in one go: feed it, then close."""
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        """Record the 'href' value(s) of a hyperlink start tag.

        'attributes' is iterated as (name, value) pairs; sgmllib invokes this
        handler for <a> start tags.
        """
        self.hyperlinks.extend(
            value for name, value in attributes if name == "href"
        )

    def get_hyperlinks(self):
        """Return the list of hyperlinks collected so far (not a copy)."""
        return self.hyperlinks


# Get something to work with. The handle is closed even if read() fails.
f = urllib.urlopen('http://elpais.com')
try:
    s = f.read()
finally:
    f.close()

# Process the page; MyParser must already be defined at this point.
myparser = MyParser()
myparser.parse(s)

# Output is disabled in this cell; uncomment to inspect the results.
# print(s)
# print(myparser.get_hyperlinks())
       
dir(s) 
       
['__add__', '__class__', '__contains__', '__delattr__', '__doc__',
'__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__',
'__getnewargs__', '__getslice__', '__gt__', '__hash__', '__init__',
'__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__',
'__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__',
'__rmul__', '__setattr__', '__sizeof__', '__str__',
'__subclasshook__', '_formatter_field_name_split',
'_formatter_parser', 'capitalize', 'center', 'count', 'decode',
'encode', 'endswith', 'expandtabs', 'find', 'format', 'index',
'isalnum', 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle',
'isupper', 'join', 'ljust', 'lower', 'lstrip', 'partition',
'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit',
'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase',
'title', 'translate', 'upper', 'zfill']
import HTMLParser h = HTMLParser.HTMLParser() a = '<textarea>document.getElementById("Area")</textarea>' h.feed(a) print h.get_starttag_text() h.close() 
       
<textarea>
dir(h) 
       
['CDATA_CONTENT_ELEMENTS', '_HTMLParser__starttag_text', '__doc__',
'__init__', '__module__', '_decl_otherchars',
'_parse_doctype_attlist', '_parse_doctype_element',
'_parse_doctype_entity', '_parse_doctype_notation',
'_parse_doctype_subset', '_scan_name', 'check_for_whole_start_tag',
'clear_cdata_mode', 'close', 'entitydefs', 'error', 'feed',
'get_starttag_text', 'getpos', 'goahead', 'handle_charref',
'handle_comment', 'handle_data', 'handle_decl', 'handle_endtag',
'handle_entityref', 'handle_pi', 'handle_startendtag',
'handle_starttag', 'interesting', 'lasttag', 'lineno', 'offset',
'parse_comment', 'parse_declaration', 'parse_endtag',
'parse_marked_section', 'parse_pi', 'parse_starttag', 'rawdata',
'reset', 'set_cdata_mode', 'unescape', 'unknown_decl', 'updatepos']
 
       

                    not available.</pre></div> </html>
import HTMLParser h = HTMLParser.HTMLParser() a = '<textarea>document.getElementById("Area")</textarea>' h.feed(a) print h.get_starttag_text() h.close() 
       
<textarea>
<textarea>
dir(h) 
       
['CDATA_CONTENT_ELEMENTS', '_HTMLParser__starttag_text', '__doc__',
'__init__', '__module__', '_decl_otherchars', '_parse_doctype_attlist',
'_parse_doctype_element', '_parse_doctype_entity',
'_parse_doctype_notation', '_parse_doctype_subset', '_scan_name',
'check_for_whole_start_tag', 'clear_cdata_mode', 'close', 'entitydefs',
'error', 'feed', 'get_starttag_text', 'getpos', 'goahead',
'handle_charref', 'handle_comment', 'handle_data', 'handle_decl',
'handle_endtag', 'handle_entityref', 'handle_pi', 'handle_startendtag',
'handle_starttag', 'interesting', 'lasttag', 'lineno', 'offset',
'parse_comment', 'parse_declaration', 'parse_endtag',
'parse_marked_section', 'parse_pi', 'parse_starttag', 'rawdata',
'reset', 'set_cdata_mode', 'unescape', 'unknown_decl', 'updatepos']
['CDATA_CONTENT_ELEMENTS', '_HTMLParser__starttag_text', '__doc__', '__init__', '__module__', '_decl_otherchars', '_parse_doctype_attlist', '_parse_doctype_element', '_parse_doctype_entity', '_parse_doctype_notation', '_parse_doctype_subset', '_scan_name', 'check_for_whole_start_tag', 'clear_cdata_mode', 'close', 'entitydefs', 'error', 'feed', 'get_starttag_text', 'getpos', 'goahead', 'handle_charref', 'handle_comment', 'handle_data', 'handle_decl', 'handle_endtag', 'handle_entityref', 'handle_pi', 'handle_startendtag', 'handle_starttag', 'interesting', 'lasttag', 'lineno', 'offset', 'parse_comment', 'parse_declaration', 'parse_endtag', 'parse_marked_section', 'parse_pi', 'parse_starttag', 'rawdata', 'reset', 'set_cdata_mode', 'unescape', 'unknown_decl', 'updatepos']