#!/usr/bin/env python3
"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""
import hashlib
import urllib.parse
import urllib.robotparser

import httplib2

import html5lib
from html5lib import treebuilders


class Spider(object):
    """Crawls pages starting from a seed URL, recording any that break the parser."""

    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        """Crawl up to maxURLs pages, starting from initialURL."""
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        """Parse content with html5lib, recording the URL if parsing fails."""
        failed = False
        # Build elements without the XHTML namespace so that the ".//a"
        # search in updateURLs matches the anchor elements.
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        """Fetch a URL, returning its content or None if it should be skipped."""
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            # We have already seen this exact content under another URL
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the list of visited and unvisited URLs according to whether we
        have seen them before or not."""
        urls = set()
        # Remove all links we have already visited
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                # Relative URL: borrow the host from the page we are on
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links not of content-type html or pages not found
        # XXX - need to deal with other status codes?
        toVisit = set(url for url in urls
                      if url in responseHeaders and
                      "html" in responseHeaders[url]['content-type'] and
                      responseHeaders[url]['status'] == "200")

        # Now check we are allowed to spider the page; iterate over a copy so
        # that disallowed URLs can be removed from toVisit as we go
        for url in list(toVisit):
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            self.robotParser.read()
            if not self.robotParser.can_fetch("*", url):
                toVisit.remove(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
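

# A minimal sketch of a command-line entry point, mirroring the usage shown in
# the module docstring. The seed URL and page limit below are illustrative
# defaults, not part of the original script.
if __name__ == "__main__":
    import sys

    seed = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(seed, maxURLs=100)
    print("Visited %d URLs, found %d buggy URLs" %
          (len(s.visitedURLs), len(s.buggyURLs)))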