First commit
This commit is contained in:
parent 6f43e7eb1c
commit 13eddea2cb
@@ -0,0 +1,7 @@
{
    "startingPage": "https://example.com",
    "domain": "example.com",
    "excludes": [
        "example.com/pages/to/exclude"
    ]
}
@@ -0,0 +1,147 @@
# Crawl a single domain starting from a configured page, recording visited
# pages, outbound links, and media links, and writing each list to CSV.
# Usage: python <this script> path/to/config.json
import requests
import bs4
import json
import sys
import os
from urllib.parse import urlparse

# File extensions (not MIME types, despite the name) treated as media links.
mediaMimes = ['jpg', 'webp', 'png', 'mp3', 'mp4', 'gif', 'jpeg', 'jfif', 'pdf', 'rar', 'zip', 'exe']
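

# True if the link matches any exclusion substring from the config.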
def excluded(link, exclusionRules):
    for substring in exclusionRules:
        if substring in link:
            return True
    return False


def getCode(link):
    # A HEAD request is enough to read the status code without the body.
    try:
        return requests.head(link, timeout=10).status_code
    except requests.RequestException:
        return 'ERROR'
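

# YouTube serves a 200 page even for removed videos, so probe the video's
# thumbnail on img.youtube.com instead; it 404s when the video is gone.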
def getYoutubeCode(link):
    # Pull the 11-character video ID out of the common URL shapes.
    if 'youtube.com/embed/' in link:
        pos = link.find('embed/') + len('embed/')
        yId = link[pos:pos + 11]
    elif 'youtube.com/watch?v=' in link:
        pos = link.find('watch?v=') + len('watch?v=')
        yId = link[pos:pos + 11]
    elif 'youtu.be/' in link:
        pos = link.find('youtu.be/') + len('youtu.be/')
        yId = link[pos:pos + 11]
    else:
        return getCode(link)
    return getCode('https://img.youtube.com/vi/' + yId + '/mqdefault.jpg')
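

# Fetch a page and collect every link from <a href>, <img src>, <iframe src>,
# and <source src>, resolving each to an absolute URL.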
def request(page):
    req = requests.get(page, timeout=10)
    res = req.content

    soup = bs4.BeautifulSoup(res, 'html5lib')
    links = [tag.get('href') for tag in soup.find_all("a")]
    links += [tag.get('src') for tag in soup.find_all("img")]
    links += [tag.get('src') for tag in soup.find_all("iframe")]
    links += [tag.get('src') for tag in soup.find_all("source")]

    results = []
    urlParse = urlparse(page)
    for e in links:
        if e:
            if e[0] == '#':
                pass  # same-page fragment, nothing to crawl
            elif e[:2] == '//':
                # Protocol-relative URL: borrow the page's scheme.
                results += [urlParse.scheme + ':' + e]
            elif e[0] == '/':
                # Root-relative URL.
                results += [urlParse.scheme + '://' + urlParse.netloc + e]
            elif e[:7] == 'http://':
                results += [e]
            elif e[:8] == 'https://':
                results += [e]
            elif e[:3] == '../':
                # Relative URL one directory above the current page.
                parentFolder = '/'.join(page.split('/')[:-2]) + '/'
                results += [parentFolder + e[3:]]
            else:
                # Relative URL next to the current page.
                parentFolder = '/'.join(page.split('/')[:-1]) + '/'
                results += [parentFolder + e]

    return results
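
# Note: urllib.parse.urljoin(page, e) would cover most of these branches
# (protocol-relative, root-relative, and '../' paths) in one call.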

configPath = sys.argv[1]

with open(configPath, 'r') as f:
    data = json.load(f)

currentPage = data['startingPage']
domain = data['domain']
exclusionRules = data['excludes']
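
# Crawl state: a FIFO queue of pages to visit, de-duplication lists,
# and row buffers for the three CSV reports.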
pageToCrawl = [currentPage]
visitedPages = []
outboundPages = []
medias = []

visitedPagesCSV = []
outboundPagesCSV = []
mediasCSV = []
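
# Breadth-first crawl: pop the first queued page, classify each of its links
# as outbound, media, or an internal page to enqueue.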
while pageToCrawl:
    currentPage = pageToCrawl[0]
    print('Current page:', currentPage)
    linksInPage = request(currentPage)

    # Apply exclusionRules
    linksToKeep = []
    for link in linksInPage:
        if not excluded(link, exclusionRules):
            linksToKeep += [link]
    linksInPage = linksToKeep

    for link in linksInPage:
        if domain not in link:
            # Link leaves the configured domain: record it, don't crawl it.
            if link not in outboundPages:
                outboundPages += [link]
                if 'youtube.com' in link:
                    outboundPagesCSV += [[link, currentPage, str(getYoutubeCode(link))]]
                else:
                    outboundPagesCSV += [[link, currentPage, str(getCode(link))]]
        elif link.split('.')[-1] in mediaMimes or ('data:' in link and 'base64' in link):
            # Media file or inline base64 data URI: record it, don't crawl it.
            if link not in medias:
                medias += [link]
                mediasCSV += [[link, currentPage, str(getCode(link))]]
        elif link not in pageToCrawl:
            if link not in visitedPages:
                pageToCrawl += [link]

    pageToCrawl.remove(currentPage)
    visitedPages += [currentPage]
    visitedPagesCSV += [[currentPage, str(getCode(currentPage))]]
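

# Write the three reports as naive CSV (fields are quoted but not escaped,
# which is fine for URLs containing no double quotes).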
outputFolder = configPath[:-5] + '/'  # config path minus its '.json' extension
os.mkdir(outputFolder)

with open(outputFolder + "visitedPages.csv", "w") as f:
    for link in visitedPagesCSV:
        f.write('"' + '","'.join(link) + '"' + '\n')

with open(outputFolder + "outboundPages.csv", "w") as f:
    for link in outboundPagesCSV:
        f.write('"' + '","'.join(link) + '"' + '\n')

with open(outputFolder + "medias.csv", "w") as f:
    for link in mediasCSV:
        f.write('"' + '","'.join(link) + '"' + '\n')
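
To run the crawler (assuming the script is saved as crawler.py and the config above as config.json): python crawler.py config.json. Output lands in a folder named after the config file minus its .json extension (here, config/), containing visitedPages.csv, outboundPages.csv, and medias.csv.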