From 13eddea2cbf683a823c6b39dce0c2fff6e3c745e Mon Sep 17 00:00:00 2001
From: tbarillot
Date: Thu, 26 Aug 2021 12:56:53 +0200
Subject: [PATCH] First commit

---
 example.com.json |   7 +++
 script.py        | 147 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 example.com.json
 create mode 100644 script.py

diff --git a/example.com.json b/example.com.json
new file mode 100644
index 0000000..74a0a01
--- /dev/null
+++ b/example.com.json
@@ -0,0 +1,7 @@
+{
+    "startingPage": "https://example.com",
+    "domain": "example.com",
+    "excludes": [
+        "example.com/pages/to/exclude"
+    ]
+}
\ No newline at end of file
diff --git a/script.py b/script.py
new file mode 100644
index 0000000..9f0b0d7
--- /dev/null
+++ b/script.py
@@ -0,0 +1,147 @@
+import requests
+import bs4
+import time
+import json
+import sys
+import os
+from urllib.parse import urlparse
+
+mediaMimes = ['jpg', 'webp', 'png', 'mp3', 'mp4', 'gif', 'jpeg', 'jfif', 'pdf', 'rar', 'zip', 'exe']
+
+def excluded(link, exclusionRules):
+    for substring in exclusionRules:
+        if substring in link:
+            return True
+    return False
+
+def getCode(link):
+    try:
+        return requests.head(link, timeout=10).status_code
+    except:
+        return 'ERROR'
+
+def getYoutubeCode(link):
+    if 'youtube.com/embed/' in link:
+        pos = link.find('embed/') + len('embed/')
+        yId = link[pos:pos + 11]
+    elif 'youtube.com/watch?v=' in link:
+        pos = link.find('watch?v=') + len('watch?v=')
+        yId = link[pos:pos + 11]
+    elif 'youtu.be/' in link:
+        pos = link.find('youtu.be/') + len('youtu.be/')
+        yId = link[pos:pos + 11]
+    else:
+        return getCode(link)
+    return getCode('https://img.youtube.com/vi/' + yId + '/mqdefault.jpg')
+
+def request(page):
+    req = requests.get(page)
+    res = req.content
+
+    soup = bs4.BeautifulSoup(res,'html5lib')
+    anchor_tags = soup.find_all("a")
+    links = [tag.get('href') for tag in anchor_tags]
+
+    anchor_tags = soup.find_all("img")
+    links += [tag.get('src') for tag in anchor_tags]
+
+    anchor_tags = soup.find_all("iframe")
+    links += [tag.get('src') for tag in anchor_tags]
+
+    anchor_tags = soup.find_all("source")
+    links += [tag.get('src') for tag in anchor_tags]
+
+    results = []
+    urlParse = urlparse(page)
+    for e in links:
+        if e:
+            if e[0] == '#':
+                pass
+            elif e[:2] == '//':
+                results += [urlParse.scheme + ':' + e]
+            elif e[0] == '/':
+                results += [urlParse.scheme + '://' + urlParse.netloc + e]
+            elif e[:7] == 'http://':
+                results += [e]
+            elif e[:8] == 'https://':
+                results += [e]
+            elif e[:3] == '../':
+                parentFolder = '/'.join(page.split('/')[:-1]) + '/'
+                results += [parentFolder + e[3:]]
+            else:
+                parentFolder = '/'.join(page.split('/')[:-1]) + '/'
+                results += [parentFolder + e]
+
+    return results
+
+
+
+
+configPath = sys.argv[1]
+
+
+
+with open(configPath, 'r') as f:
+    data = json.load(f)
+    currentPage = data['startingPage']
+    domain = data['domain']
+    exclusionRules = data['excludes']
+
+pageToCrawl = [currentPage]
+visitedPages = []
+outboundPages = []
+medias = []
+
+visitedPagesCSV = []
+outboundPagesCSV = []
+mediasCSV = []
+
+while pageToCrawl:
+    currentPage = pageToCrawl[0]
+    print('Current page:', currentPage)
+    linksInPage = request(currentPage)
+
+    # Apply exclusionRules
+    linksToKeep = []
+    for link in linksInPage:
+        if not excluded(link, exclusionRules):
+            linksToKeep += [link]
+    linksInPage = linksToKeep
+
+    for link in linksInPage:
+        if domain not in link:
+            if link not in outboundPages:
+                #print('Outbound found:', link)
+                outboundPages += [link]
+                if 'youtube.com' in link:
+                    outboundPagesCSV += [[link, currentPage, str(getYoutubeCode(link))]]
+                else:
+                    outboundPagesCSV += [[link, currentPage, str(getCode(link))]]
+        elif link.split('.')[-1] in mediaMimes or ('data:' in link and 'base64' in link):
+            if link not in medias:
+                #print('Media found:', link)
+                medias += [link]
+                mediasCSV += [[link, currentPage, str(getCode(link))]]
+        elif link not in pageToCrawl:
+            if link not in visitedPages:
+                pageToCrawl += [link]
+
+    pageToCrawl.remove(currentPage)
+    visitedPages += [currentPage]
+    visitedPagesCSV += [[currentPage, str(getCode(currentPage))]]
+
+outputFolder = configPath[:-5] + '/'
+os.mkdir(outputFolder)
+
+with open(outputFolder + "visitedPages.csv", "w") as f:
+    for link in visitedPagesCSV:
+        f.write('"' + '","'.join(link) + '"' + '\n')
+
+with open(outputFolder + "outboundPages.csv", "w") as f:
+    for link in outboundPagesCSV:
+        f.write('"' + '","'.join(link) + '"' + '\n')
+
+with open(outputFolder + "medias.csv", "w") as f:
+    for link in mediasCSV:
+        f.write(','.join(link) + '\n')
+