"""Crawl a site from a JSON config and report visited pages, outbound links and media files."""

import requests
import bs4
import json
import sys
import os
from urllib.parse import urlparse

# File extensions (plus base64 data URIs) recorded as media instead of being crawled.
mediaMimes = ['jpg', 'webp', 'png', 'mp3', 'mp4', 'gif', 'jpeg', 'jfif', 'pdf', 'rar', 'zip', 'exe']


def excluded(link, exclusionRules):
    # A link is excluded if it contains any of the configured substrings.
    for substring in exclusionRules:
        if substring in link:
            return True
    return False


def getCode(link):
    # HEAD the link and return its HTTP status code, or 'ERROR' if the request fails.
    try:
        return requests.head(link, timeout=10).status_code
    except Exception:
        return 'ERROR'


def getYoutubeCode(link):
    # Check the video's thumbnail rather than the watch page itself, since the
    # thumbnail 404s when the video no longer exists.
    if 'youtube.com/embed/' in link:
        pos = link.find('embed/') + len('embed/')
    elif 'youtube.com/watch?v=' in link:
        pos = link.find('watch?v=') + len('watch?v=')
    elif 'youtu.be/' in link:
        pos = link.find('youtu.be/') + len('youtu.be/')
    else:
        return getCode(link)
    yId = link[pos:pos + 11]  # YouTube video IDs are 11 characters long
    return getCode('https://img.youtube.com/vi/' + yId + '/mqdefault.jpg')


def request(page):
    # Fetch a page and return every link found in <a>, <img>, <iframe> and
    # <source> tags, resolved to an absolute URL.
    res = requests.get(page, timeout=10).content
    soup = bs4.BeautifulSoup(res, 'html5lib')
    links = [tag.get('href') for tag in soup.find_all('a')]
    links += [tag.get('src') for tag in soup.find_all('img')]
    links += [tag.get('src') for tag in soup.find_all('iframe')]
    links += [tag.get('src') for tag in soup.find_all('source')]

    results = []
    urlParse = urlparse(page)
    for e in links:
        if not e:
            continue
        if e[0] == '#':
            pass  # same-page fragment, ignore
        elif e[:2] == '//':
            results += [urlParse.scheme + ':' + e]  # protocol-relative URL
        elif e[0] == '/':
            results += [urlParse.scheme + '://' + urlParse.netloc + e]  # root-relative URL
        elif e[:7] == 'http://' or e[:8] == 'https://':
            results += [e]  # already absolute
        elif e[:3] == '../':
            # Strip one path segment; nested '../../' links are not handled.
            parentFolder = '/'.join(page.split('/')[:-2]) + '/'
            results += [parentFolder + e[3:]]
        else:
            parentFolder = '/'.join(page.split('/')[:-1]) + '/'
            results += [parentFolder + e]
    return results


# The crawl configuration is a JSON file passed as the first argument.
configPath = sys.argv[1]
with open(configPath, 'r') as f:
    data = json.load(f)

currentPage = data['startingPage']
domain = data['domain']
exclusionRules = data['excludes']

pageToCrawl = [currentPage]   # queue of internal pages still to visit
visitedPages = []             # internal pages already crawled
outboundPages = []            # links leaving the configured domain
medias = []                   # media files and base64 data URIs
visitedPagesCSV = []
outboundPagesCSV = []
mediasCSV = []

while pageToCrawl:
    currentPage = pageToCrawl[0]
    print('Current page:', currentPage)
    linksInPage = request(currentPage)

    # Drop links matching the exclusion rules.
    linksInPage = [link for link in linksInPage if not excluded(link, exclusionRules)]

    for link in linksInPage:
        # base64 data URIs never contain the domain, so test them before the
        # outbound check and classify them as media rather than as outbound links.
        isDataUri = 'data:' in link and 'base64' in link
        if domain not in link and not isDataUri:
            # Outbound link: record it once, with its status code.
            if link not in outboundPages:
                outboundPages += [link]
                if 'youtube.com' in link:
                    outboundPagesCSV += [[link, currentPage, str(getYoutubeCode(link))]]
                else:
                    outboundPagesCSV += [[link, currentPage, str(getCode(link))]]
        elif link.split('.')[-1] in mediaMimes or isDataUri:
            # Media file: record it once, with its status code.
            if link not in medias:
                medias += [link]
                mediasCSV += [[link, currentPage, str(getCode(link))]]
        elif link not in pageToCrawl and link not in visitedPages:
            # Internal page not seen yet: queue it for crawling.
            pageToCrawl += [link]

    pageToCrawl.remove(currentPage)
    visitedPages += [currentPage]
    visitedPagesCSV += [[currentPage, str(getCode(currentPage))]]

# Write the results to a folder named after the config file.
outputFolder = os.path.splitext(configPath)[0] + '/'
os.makedirs(outputFolder, exist_ok=True)

with open(outputFolder + 'visitedPages.csv', 'w') as f:
    for link in visitedPagesCSV:
        f.write('"' + '","'.join(link) + '"' + '\n')

with open(outputFolder + 'outboundPages.csv', 'w') as f:
    for link in outboundPagesCSV:
        f.write('"' + '","'.join(link) + '"' + '\n')

with open(outputFolder + 'medias.csv', 'w') as f:
    for link in mediasCSV:
        # Quote these fields too: data URIs and query strings can contain commas.
        f.write('"' + '","'.join(link) + '"' + '\n')
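
# Example usage (a sketch: the filenames "crawler.py" and "crawl.json" and the
# URLs/substrings below are placeholders; the config keys match the reads above):
#
#   crawl.json:
#     {
#       "startingPage": "https://example.com/",
#       "domain": "example.com",
#       "excludes": ["mailto:", "/logout"]
#     }
#
#   $ python crawler.py crawl.json
#
# Results would be written to crawl/visitedPages.csv, crawl/outboundPages.csv
# and crawl/medias.csv.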