summary | refs | log | tree | commit | diff
path: root/scrap/parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'scrap/parser.py')
-rw-r--r-- scrap/parser.py 83
1 files changed, 83 insertions, 0 deletions
diff --git a/scrap/parser.py b/scrap/parser.py
new file mode 100644
index 0000000..618f642
--- /dev/null
+++ b/scrap/parser.py
@@ -0,0 +1,83 @@
import json
import os
import re
import subprocess

import structures
+
+
def process_json_file(name, artist_id):
    """Load the album JSON export for ``name`` and parse it into an album.

    The file name is derived from ``name`` by ``getLink`` and is resolved
    relative to the current working directory.

    Args:
        name: Album name used to build the export's file name.
        artist_id: Identifier passed through unchanged to ``process_json``.

    Returns:
        Whatever ``process_json`` returns for the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    link = getLink(name)

    # The context manager closes the handle even when read() raises.
    # JSON text is UTF-8 by specification, so do not depend on the
    # platform's default encoding.
    with open(link, "r", encoding="utf-8") as file_json:
        album_json = file_json.read()

    return process_json(album_json, artist_id)
+
+
def getLink(name):
    """Return the JSON export file name for an album name.

    The name is ``Lyrics_<name>.json`` where ``<name>`` is ``name`` with
    every space character removed; no directory component is added.

    Args:
        name: Album name, possibly containing spaces.

    Returns:
        The relative file name as a string.
    """
    return f"Lyrics_{name.replace(' ', '')}.json"
+
+
def process_json(album_json, artist_id):
    """Parse an album JSON document, fetch its cover and build the album.

    Args:
        album_json: JSON text of an object providing the keys ``name``,
            ``artist`` (itself holding ``name``),
            ``cover_art_thumbnail_url`` and ``tracks``.
        artist_id: Identifier passed through unchanged to
            ``structures.album``.

    Returns:
        The object produced by
        ``structures.album(name, cover_path, songs, artist_id)``, where
        ``name`` is the lower-cased album name.

    Raises:
        json.JSONDecodeError: If ``album_json`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    data = json.loads(album_json)

    album_name = data["name"].lower()
    # Directory-friendly artist name: lower-cased with spaces removed.
    artist_name = data["artist"]["name"].lower().replace(" ", "")

    cover_url = data["cover_art_thumbnail_url"]
    cover_path = get_cover_link(artist_name, album_name)
    download_cover(cover_url, cover_path, artist_name)

    songs = [analyze_song(track) for track in data["tracks"]]
    return structures.album(album_name, cover_path, songs, artist_id)
+
+
def get_cover_link(artist_name, album_name):
    """Return the local path under which an album cover is stored.

    Args:
        artist_name: Name of the artist's sub-directory; used as given.
        album_name: Album name; spaces are removed before it is used as
            the file stem.

    Returns:
        The relative path ``covers/<artist_name>/<album_name>.png``.
    """
    file_stem = album_name.replace(" ", "")
    return f"covers/{artist_name}/{file_stem}.png"
+
+
def download_cover(off_cover, new_cover, artist_name):
    """Download an album cover unless it is already stored locally.

    Ensures the directory ``covers/<artist_name>`` exists, then fetches
    ``off_cover`` into ``new_cover`` with the external ``wget`` program.
    Nothing is downloaded when ``new_cover`` is already a file.

    Args:
        off_cover: URL of the cover image.
        new_cover: Local path the image is written to.
        artist_name: Name of the artist's sub-directory below ``covers``.

    Raises:
        OSError: If the cover directory cannot be created.
        FileNotFoundError: If the ``wget`` executable cannot be found.
    """
    # os.path.isfile() is never true for a directory, so guarding "mkdir"
    # with it re-ran mkdir on every call. makedirs(exist_ok=True) creates
    # "covers" and the artist directory in one idempotent step.
    os.makedirs(f"covers/{artist_name}", exist_ok=True)

    if os.path.isfile(new_cover):
        return

    # Pass an argument list rather than a shell string: the URL and the
    # path are built from album data and must not be interpreted by a
    # shell. "--" keeps a value starting with "-" from being read as an
    # option. wget's exit status is ignored, as it was before.
    subprocess.run(["wget", "-O", new_cover, "--", off_cover], check=False)
+
+
def analyze_song(song):
    """Build a song object from one track entry of the album JSON.

    Args:
        song: Mapping whose ``"song"`` entry holds the keys ``"title"``
            and ``"lyrics"``.

    Returns:
        ``structures.song(title, lyrics)`` with the title lower-cased and
        the lyrics cleaned by ``format_lyrics``.

    Raises:
        KeyError: If ``"song"``, ``"title"`` or ``"lyrics"`` is missing.
    """
    details = song["song"]
    title = details["title"].lower()
    lyrics = format_lyrics(details["lyrics"])
    return structures.song(title, lyrics)
+
+
def format_lyrics(lyrics):
    """Clean raw scraped lyrics into lower-case plain text.

    Steps, in order:

    1. Drop everything up to and including the first ``Lyrics`` marker
       (the text is kept whole when the marker is absent).
    2. Drop everything from the first ``Embed`` marker onwards.
    3. Remove every ``You might also like`` phrase.
    4. Cut the text at the first ``See <...> LiveGet`` banner.
    5. Remove ``(...)`` and ``[...]`` annotations that sit on one line.
    6. Collapse runs of three or more newlines into one blank line.
    7. Replace Cyrillic U+0435 with Latin ``e`` and U+2005 with a space.
    8. Strip leading newlines and lower-case the result.

    Args:
        lyrics: Raw lyrics text; may be empty or ``None``.

    Returns:
        The cleaned lyrics, or ``""`` when ``lyrics`` is empty or ``None``.
    """
    if not lyrics:
        return ""

    # partition() instead of split("Lyrics")[1]: it does not raise when the
    # marker is missing and does not discard the text that follows a second
    # occurrence of the word.
    _, marker, after_header = lyrics.partition("Lyrics")
    text = (after_header if marker else lyrics).lstrip()

    text = text.partition("Embed")[0].rstrip()
    text = text.replace("You might also like", "")

    # NOTE(review): everything after the first "See <...> LiveGet" banner is
    # discarded, not only the banner itself - confirm this is intended.
    text = re.sub(r'See (.*?) LiveGet', 'liveget', text)
    text = text.split("liveget")[0].rstrip()

    # "." does not match a newline, so only single-line annotations go.
    text = re.sub(r'[\(\[].*?[\)\]]', '', text)
    text = re.sub(r"\n{3,}", "\n\n", text)

    text = text.replace("\u0435", "e")
    text = text.replace("\u2005", " ")

    # Removing annotations can leave leading newlines or nothing at all;
    # lstrip() handles the empty string, which indexing text[0] did not.
    text = text.lstrip("\n")

    return text.lower()