# links.py -- code/links.py from the kgisl/pythonFDP repository (master branch)


import requests
from bs4 import BeautifulSoup
def getOutBoundURLs(anchors):
    """Collect the href address of every anchor that has one.

    http://www.w3.org/TR/html5/text-level-semantics.html#the-a-element

    anchors contains all <a> tag elements from the HTML content.
    Iterate through the list of anchors and build a list containing
    the href addresses whenever one is available. Anchors without a
    usable href are reported on stdout and skipped.

    Returns the list of href values in the order the anchors were given.

    @author kgashok
    """
    addressList = []
    for anchor in anchors:
        try:
            linkaddress = anchor.attrs['href']
        except (KeyError, AttributeError, TypeError):
            # No href on this element (or it is not tag-like at all): report
            # and move on. Only lookup failures are caught, so that
            # KeyboardInterrupt / SystemExit still propagate.
            print("Ignoring", anchor)
        else:
            addressList.append(linkaddress)
    return addressList
def getOutBoundHttpURLs(alist):
    """Return, in their original order, the addresses that begin with "https:".

    Note that despite the "Http" in the name, the prefix tested is
    "https:" (case-sensitive); plain "http:" and relative addresses are
    dropped.
    """
    secureAddresses = []
    for candidate in alist:
        if candidate.startswith("https:"):
            secureAddresses.append(candidate)
    return secureAddresses
def generateBanner(url, anchorCount, outBoundCount, linkCount):
    """Build the multi-line banner that introduces one site's link listing.

    The banner is a summary line framed by two rules of '#' characters,
    with a newline before the first rule and after the last. The summary
    shows the url followed by the three counts, the last one labelled
    "https:".
    """
    rule = "#############################"
    summary = "---- Outbound links for {0} ({1}, {2}, https: {3})".format(
        url, str(anchorCount), str(outBoundCount), str(linkCount)
    )
    # Empty first and last items give the leading and trailing newlines.
    return "\n".join(["", rule, summary, rule, ""])
def printURLs(url, anchors, f=None):
    """Print the https addresses found in anchors, under a banner.

    Print only those addresses that start with 'https' from valid anchors.
    If 'f' is given (an open, writable text file), write the banner and
    the extracted URLs to that file as well, one "url -> address" line
    per address.

    url     -- the page the anchors were taken from (used in the banner
               and as the left-hand side of each line written to f)
    anchors -- the <a> elements extracted from that page
    f       -- optional file object; nothing is written when it is None
    """
    outBoundURLs = getOutBoundURLs(anchors)
    hhrefs = getOutBoundHttpURLs(outBoundURLs)
    banner = generateBanner(url, len(anchors), len(outBoundURLs), len(hhrefs))
    print(banner)
    # Same guard as in the loop below: f defaults to None, so the banner
    # must only be written when a file was actually supplied.
    if f:
        f.write(banner)
    for linkaddress in hhrefs:
        print(linkaddress)
        if f:
            f.write(url + " -> " + linkaddress + "\n")
# Sites whose outbound https links are collected into data.txt.
urls = [
    "http://www.python.org",
    "http://www.facebook.com",
    "http://www.kct.ac.in",
    "http://www.psgtech.edu",
    "http://www.kgkite.ac.in",
    # "http://www.kghospital.com",
    # "http://www.kgisl.com",
    "http://www.sece.ac.in",
    "http://www.siet.ac.in",
    "http://www.bennett.edu.in",
]

# The context manager closes data.txt even if a page fails part-way through.
with open("data.txt", "w", encoding="utf-8") as dataFile:
    for url in urls:
        try:
            # Without a timeout a single unresponsive site would hang the run.
            html = requests.get(url, timeout=30)
        except requests.RequestException as err:
            # One unreachable site should not abort the remaining ones.
            print("Skipping", url, "->", err)
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html.text, "html.parser")
        anchors = soup.find_all("a")
        # equivalent one-liner
        # anchors = BeautifulSoup(requests.get(url).text, "html.parser").find_all('a')
        printURLs(url, anchors, dataFile)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

links.py

Latest commit

History

links.py

File metadata and controls