Topics/Material/Reading for Week 4

This week is based on Lessons 3 and 4 of the Udacity course CS101 "Introduction to Computer Science". It is best to sign up for the course and watch these lectures.

We will cover: dictionaries, collecting the links on a web page, finishing the web crawler, building the index while crawling, and measuring execution time.

Check out Subsection 5.5. "Dictionaries" of the Python Tutorial.

Also, check out Subsection 4.4 "Dictionaries" of the Programming Knights book. Here is the link to Chapter 4.
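
As a quick preview of that reading, here is a minimal sketch of how a Python dictionary maps keys to values (the keys and values below are made up for illustration):

# a dictionary maps keys to values (illustrative data only)
room = {}                      # create an empty dictionary
room['alice'] = 'HEC 308'      # associate the key 'alice' with a value
room['bob'] = 'HEC 212'

print(room['alice'])           # look up a value by its key: HEC 308
print('carol' in room)         # membership test on keys: False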

Collecting Links

The task is to write a function that collects all the URLs in a given web page (passed as input to the function) and returns a list of these URLs.

Change Print All Links To Get All Links

We seek to obtain a list with all the URLs instead of printing out all URLs.

def get_next_target(s):
    start_link = s.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = s.find('"', start_link)
    end_quote = s.find('"', start_quote + 1)
    url = s[start_quote + 1 : end_quote]
    return url, end_quote

def print_all_links(page):
    while True:
        url, endpos = get_next_target(page)
        if url:
            print(url)
            page = page[endpos:]
        else:
            break
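
To see what these procedures do, here is a small test on a made-up page string (the URLs are placeholders):

# a tiny made-up page string with two links
test_page = ('Here is one <a href="http://example.com/a">link</a> and '
             'another <a href="http://example.com/b">link</a>.')

url, endpos = get_next_target(test_page)
print(url)                  # http://example.com/a
print_all_links(test_page)  # prints both URLs, one per line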

Change Print All Links To Get All Links - Programming Quiz

We seek to obtain a list with all the URLs instead of printing out all URLs.

def print_all_links(page):
    # initialize the variable links
    links =
    while True:
        url, endpos = get_next_target(page)
        if url:
            print(url)  # change this line
            page = page[endpos:]
        else:
            break
    # complete the return statement
    return

Change Print All Links To Get All Links - Solution to Programming Quiz

We seek to obtain a list with all the URLs instead of printing out all URLs.

def print_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
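
A small test on a made-up page string now returns a list instead of printing:

test_page = ('<a href="http://example.com/a">a</a> '
             '<a href="http://example.com/b">b</a>')

print(print_all_links(test_page))
# ['http://example.com/a', 'http://example.com/b']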

Finishing the Web Crawler

# start with tocrawl = [seed]
# crawled = []
# while there are more pages to crawl:
#     pick a page from tocrawl
#     add that page to crawled
#     add all the link targets on this page to tocrawl
# return crawled

Finishing the Web Crawler - Programming Quiz Step 1

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page

def crawl_web(seed):
    tocrawl = # initialize correctly
    crawled = # initialize correctly
    # don't worry about the rest of the code

Finishing the Web Crawler - Solution to Programming Quiz Step 1

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    # don't worry about the rest of the code

Finishing the Web Crawler - Programming Quiz Step 2

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = # retrieve the last URL from the list tocrawl and assign it to the variable url
        # if you use the correct list operation, 
        # you can remove this URL from the list tocrawl at the same time
        # don't worry about the rest of the code

Finishing the Web Crawler - Solution to Programming Quiz Step 2

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        # don't worry about the rest of the code
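
The list method pop is the operation the hint refers to: it removes the last element from the list and returns it in one step. For example:

tocrawl = ['a.html', 'b.html', 'c.html']
url = tocrawl.pop()  # removes and returns the last element
print(url)           # c.html
print(tocrawl)       # ['a.html', 'b.html']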

Finishing the Web Crawler - Programming Quiz Step 3

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if <test>: # crawl only if we have not yet crawled this URL
                   # fill in the correct test
        # don't worry about the rest of the code

Finishing the Web Crawler - Solution to Programming Quiz Step 3

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled: 
        # don't worry about the rest of the code

Finishing the Web Crawler - Programming Quiz Step 4

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page.

import urllib.request

def get_page(url):
    response = urllib.request.urlopen(url)
    return response.read().decode("utf-8")

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            # fill in the missing code using
            # the functions union, get_all_links, get_page
    return crawled

print(crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html'))

Finishing the Web Crawler - Solution to Programming Quiz Step 4 - crawler.py

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page.

import urllib.request

def get_page(url):
    response = urllib.request.urlopen(url)
    return response.read().decode("utf-8")

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            union(tocrawl, get_all_links(get_page(url)))
            crawled.append(url)
    return crawled

def main():
    print(crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html'))

main()
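
Note that union modifies its first argument in place: it appends to p every element of q that is not already in p, and returns nothing. A small illustration:

p = ['a', 'b']
q = ['b', 'c', 'd']
union(p, q)
print(p)  # ['a', 'b', 'c', 'd']
print(q)  # ['b', 'c', 'd'] (q is unchanged)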

Finishing the Web Crawler - Drawing the URL Graph - crawler_graph.py

We will use the package networkx. Check out the networkx tutorial.

# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page.

import urllib.request

import networkx as nx # package for working with graphs
import matplotlib.pyplot as plt # matplotlib is needed to display the graphs drawn by networkx

# copy code from previous slide
# omitted for brevity 

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    # initialize G to be an empty directed graph
    G = nx.DiGraph()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            all_links = get_all_links(get_page(url))
            union(tocrawl, all_links)
            crawled.append(url)
            # add a node to the graph G and name it using the value in url
            G.add_node(url)
            # for each link URL u in the web page corresponding to url,
            # add the directed edge of the form current URL -> link URL
            for u in all_links:
                G.add_edge(url, u) # to name the edge, use the values stored in the variables url and u
    return crawled, G

def main():
    crawled, G = crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html')
    print(crawled)
    nx.draw(G)
    plt.show() # display the figure (needed when running as a script)

main()
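
If the default drawing is hard to read, one possibility (a sketch, not part of the assignment; the name draw_and_save and the filename url_graph.png are just examples) is to draw the graph with node labels and save it to a file:

# sketch: draw the URL graph with labels and save it as a PNG file
import matplotlib.pyplot as plt  # networkx draws its figures with matplotlib

def draw_and_save(G, filename='url_graph.png'):
    nx.draw(G, with_labels=True, node_size=50, font_size=6)
    plt.savefig(filename)  # write the current figure to disk
    plt.clf()              # clear the figure for later drawings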

Building the Index - add_to_index

# Define a procedure, add_to_index,
# that takes 3 inputs:

# - an index: [[keyword, [url, url, ...]], ...]
# - a keyword: String
# - a url: String

# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.

# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]

def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])

Building the Index - lookup

# Define a procedure, lookup,
# that takes two inputs:

# - an index
# - keyword

# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.

def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []

Building the Index - add_page_to_index

# Define a procedure, add_page_to_index,
# that takes three inputs:

#   - index
#   - url (String)
#   - content (String)

# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.    
    
def add_page_to_index(index,url,content):
    keywords = content.split()
    for keyword in keywords:
        add_to_index(index,keyword,url)
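
With no arguments, split breaks the content at whitespace, so every whitespace-separated token becomes a keyword. For example:

print('quantum computing at UCF'.split())
# ['quantum', 'computing', 'at', 'UCF']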

Building the Index - Testing the Procedures - index.py

# Define a procedure, add_to_index,
# that takes 3 inputs:

# - an index: [[keyword, [url, url, ...]], ...]
# - a keyword: String
# - a url: String

# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.

# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]

def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])
                     
# Define a procedure, lookup,
# that takes two inputs:

# - an index
# - keyword

# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.

def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []

# Define a procedure, add_page_to_index,
# that takes three inputs:

#   - index
#   - url (String)
#   - content (String)

# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.    
    
def add_page_to_index(index,url,content):
    keywords = content.split()
    for keyword in keywords:
        add_to_index(index,keyword,url)

index = []

add_to_index(index,'ucf','http://ucf.edu')
add_to_index(index,'quantum','http://mit.edu')
add_to_index(index,'computing','http://mit.edu')
print(index)

print(lookup(index,'ucf'))
print(lookup(index,'quantum'))
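
Since this week's reading covers dictionaries, note that the same index could also be stored in a Python dictionary that maps each keyword to its list of URLs. This is only a sketch for comparison (the _dict names are made up); the rest of the notes keep the list-of-lists index:

# sketch of a dictionary-based index: keyword -> list of urls
def add_to_index_dict(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup_dict(index, keyword):
    if keyword in index:
        return index[keyword]
    return []

dindex = {}
add_to_index_dict(dindex, 'ucf', 'http://ucf.edu')
add_to_index_dict(dindex, 'quantum', 'http://mit.edu')
print(lookup_dict(dindex, 'ucf'))      # ['http://ucf.edu']
print(lookup_dict(dindex, 'physics'))  # []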

Building the Index While Crawling - crawl_web_building_index.py

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            union(tocrawl, get_all_links(content))
            crawled.append(url)
    return index

Building the Index While Crawling - Putting Everything Together - crawler_index_builder.py

import urllib.request

# Define a procedure, add_to_index,
# that takes 3 inputs:

# - an index: [[keyword, [url, url, ...]], ...]
# - a keyword: String
# - a url: String

# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.

# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]

def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])
                     
# Define a procedure, lookup,
# that takes two inputs:

# - an index
# - keyword

# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.

def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []

# Define a procedure, add_page_to_index,
# that takes three inputs:

#   - index
#   - url (String)
#   - content (String)

# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.    
    
def add_page_to_index(index,url,content):
    keywords = content.split()
    for keyword in keywords:
        add_to_index(index,keyword,url)

#
#
#

def get_page(url):
    response = urllib.request.urlopen(url)
    return response.read().decode("utf-8")

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links 

def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            union(tocrawl, get_all_links(content))
            crawled.append(url)

    return index

# the definition of main is commented out on purpose so that index can be accessed in the shell
# def main():

index = crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html')
print(index)       

#main()

Measuring Execution Time

We will be using the Python module time.

import time

def time_execution(code):
    start = time.perf_counter()
    result = eval(code)
    run_time = time.perf_counter() - start
    return result, run_time

# for instance, on my computer I get
# >>> time_execution('2+2')
# (4, 8.623211838437968e-05)

# the first element 4 is the result of the code '2+2' and
# the second element 8.623211838437968e-05 is the time (in seconds)
# it takes to execute this code
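
As a side note (not part of the course code), the standard library module timeit can also be used; it runs a snippet many times and returns the total elapsed time in seconds:

import timeit

# run the expression 1000 times and report the average time per run
total = timeit.timeit('2 + 2', number=1000)
print(total / 1000)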

Building a Big Fake Index to Measure the Speed of lookup

import time

# time_execution as defined in the previous section
def time_execution(code):
    start = time.perf_counter()
    result = eval(code)
    run_time = time.perf_counter() - start
    return result, run_time

def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])

def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []

def make_string(p):
    s = ""
    for e in p:
        s = s + e
    return s

# building a fake index 
def make_big_index(size):
    index = []
    letters = ['a','a','a','a','a','a','a','a']
    while len(index) < size:
        word = make_string(letters)
        add_to_index(index, word, 'fake')
        for i in range(len(letters) - 1, 0, -1):
            if letters[i] < 'z':
                letters[i] = chr(ord(letters[i]) + 1)
                break
            else:
                letters[i] = 'a'
    return index

index10k = make_big_index(10000)

print("first element of our fake index: ", index10k[0])
print("last element of our fake index: ", index10k[-1])

# timing lookup

print(time_execution("lookup(index10k,'aaaaaaaa')"))
print(time_execution("lookup(index10k,'zzzzzzzz')"))