# L-24 MCS 507 Mon 16 Oct 2023 : webcrawler.py
"""
Prompts the user for a URL and the maximal
depth of the recursion tree.
Lists all locations of web servers that can
reached starting from the user given URL.
"""
from scanhttplinks import httplinks

def new_locations(links, visited):
    """
    Given the list links of new URLs and the
    list of already visited locations,
    returns the list of new locations (network
    locations, e.g. 'example.com') that do not
    yet occur in visited.  Each location appears
    at most once in the returned list.
    """
    from urllib.parse import urlparse
    result = []
    for url in links:
        # index 1 of the parse result is the network location (netloc)
        loc = urlparse(url)[1]
        # skip locations already visited AND locations already collected,
        # so several links to the same host yield a single entry
        if loc not in visited and loc not in result:
            result.append(loc)
    return result

def crawler(url, k, visited):
    """
    Returns the list visited updated with the
    list of locations reachable from the
    given url using at most k steps.
    """
    from urllib.parse import urlunparse
    # collect the hosts linked from this page that are not yet visited
    found = new_locations(httplinks(url), visited)
    reached = visited + found
    if k == 0:
        return reached
    # recurse into every newly discovered host with one step fewer
    for host in found:
        next_url = urlunparse(('http', host, '', '', '', ''))
        reached = crawler(next_url, k - 1, reached)
    return reached

def main():
    """
    Prompts the user for a web page,
    and prints all URLs this page refers to.
    """
    print('crawling the web ...')
    start = input('Give URL : ')
    maxdepth = int(input('give maximal depth : '))
    reachable = crawler(start, maxdepth, [])
    print('reachable locations :', reachable)
    print('total #locations :', len(reachable))

if __name__ == "__main__":
    main()
