From 13eddea2cbf683a823c6b39dce0c2fff6e3c745e Mon Sep 17 00:00:00 2001
From: tbarillot
Date: Thu, 26 Aug 2021 12:56:53 +0200
Subject: [PATCH] First commit

---
 example.com.json |   7 +++
 script.py        | 147 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 example.com.json
 create mode 100644 script.py

diff --git a/example.com.json b/example.com.json
new file mode 100644
index 0000000..74a0a01
--- /dev/null
+++ b/example.com.json
@@ -0,0 +1,7 @@
+{
+    "startingPage": "https://example.com",
+    "domain": "example.com",
+    "excludes": [
+        "example.com/pages/to/exclude"
+    ]
+}
\ No newline at end of file
diff --git a/script.py b/script.py
new file mode 100644
index 0000000..9f0b0d7
--- /dev/null
+++ b/script.py
@@ -0,0 +1,147 @@
+import requests
+import bs4
+import time
+import json
+import sys
+import os
+from urllib.parse import urlparse
+
+mediaMimes = ['jpg', 'webp', 'png', 'mp3', 'mp4', 'gif', 'jpeg', 'jfif', 'pdf', 'rar', 'zip', 'exe']
+
+def excluded(link, exclusionRules):
+    for substring in exclusionRules:
+        if substring in link:
+            return True
+    return False
+
+def getCode(link):
+    try:
+        return requests.head(link, timeout=10).status_code
+    except:
+        return 'ERROR'
+
+def getYoutubeCode(link):
+    if 'youtube.com/embed/' in link:
+        pos = link.find('embed/') + len('embed/')
+        yId = link[pos:pos + 11]
+    elif 'youtube.com/watch?v=' in link:
+        pos = link.find('watch?v=') + len('watch?v=')
+        yId = link[pos:pos + 11]
+    elif 'youtu.be/' in link:
+        pos = link.find('youtu.be/') + len('youtu.be/')
+        yId = link[pos:pos + 11]
+    else:
+        return getCode(link)
+    return getCode('https://img.youtube.com/vi/' + yId + '/mqdefault.jpg')
+
+def request(page):
+    req = requests.get(page)
+    res = req.content
+
+    soup = bs4.BeautifulSoup(res,'html5lib')
+    anchor_tags = soup.find_all("a")
+    links = [tag.get('href') for tag in anchor_tags]
+
+    anchor_tags = soup.find_all("img")
+    links += [tag.get('src') for tag in anchor_tags]
+
+    anchor_tags = soup.find_all("iframe")
+    links += [tag.get('src') for tag in anchor_tags]
+
+    anchor_tags = soup.find_all("source")
+    links += [tag.get('src') for tag in anchor_tags]
+
+    results = []
+    urlParse = urlparse(page)
+    for e in links:
+        if e:
+            if e[0] == '#':
+                pass
+            elif e[:2] == '//':
+                results += [urlParse.scheme + ':' + e]
+            elif e[0] == '/':
+                results += [urlParse.scheme + '://' + urlParse.netloc + e]
+            elif e[:7] == 'http://':
+                results += [e]
+            elif e[:8] == 'https://':
+                results += [e]
+            elif e[:3] == '../':
+                parentFolder = '/'.join(page.split('/')[:-1]) + '/'
+                results += [parentFolder + e[3:]]
+            else:
+                parentFolder = '/'.join(page.split('/')[:-1]) + '/'
+                results += [parentFolder + e]
+
+    return results
+
+
+
+
+configPath = sys.argv[1]
+
+
+
+with open(configPath, 'r') as f:
+    data = json.load(f)
+    currentPage = data['startingPage']
+    domain = data['domain']
+    exclusionRules = data['excludes']
+
+pageToCrawl = [currentPage]
+visitedPages = []
+outboundPages = []
+medias = []
+
+visitedPagesCSV = []
+outboundPagesCSV = []
+mediasCSV = []
+
+while pageToCrawl:
+    currentPage = pageToCrawl[0]
+    print('Current page:', currentPage)
+    linksInPage = request(currentPage)
+
+    # Apply exclusionRules
+    linksToKeep = []
+    for link in linksInPage:
+        if not excluded(link, exclusionRules):
+            linksToKeep += [link]
+    linksInPage = linksToKeep
+
+    for link in linksInPage:
+        if domain not in link:
+            if link not in outboundPages:
+                #print('Outbound found:', link)
+                outboundPages += [link]
+                if 'youtube.com' in link:
+                    outboundPagesCSV += [[link, currentPage, str(getYoutubeCode(link))]]
+                else:
+                    outboundPagesCSV += [[link, currentPage, str(getCode(link))]]
+        elif link.split('.')[-1] in mediaMimes or ('data:' in link and 'base64' in link):
+            if link not in medias:
+                #print('Media found:', link)
+                medias += [link]
+                mediasCSV += [[link, currentPage, str(getCode(link))]]
+        elif link not in pageToCrawl:
+            if link not in visitedPages:
+                pageToCrawl += [link]
+
+    pageToCrawl.remove(currentPage)
+    visitedPages += [currentPage]
+    visitedPagesCSV += [[currentPage, str(getCode(currentPage))]]
+
+outputFolder = configPath[:-5] + '/'
+os.mkdir(outputFolder)
+
+with open(outputFolder + "visitedPages.csv", "w") as f:
+    for link in visitedPagesCSV:
+        f.write('"' + '","'.join(link) + '"' + '\n')
+
+with open(outputFolder + "outboundPages.csv", "w") as f:
+    for link in outboundPagesCSV:
+        f.write('"' + '","'.join(link) + '"' + '\n')
+
+with open(outputFolder + "medias.csv", "w") as f:
+    for link in mediasCSV:
+        f.write(','.join(link) + '\n')
+