From 2e8ddbf422a3a6d75b1e1f73cad089111f801c05 Mon Sep 17 00:00:00 2001
From: niliara-edu
Date: Tue, 24 Sep 2024 12:21:40 +0200
Subject: finished web scrapper

---
 scrap/.gitignore         |   3 ++
 scrap/Lyrics_apathy.json |   2 +-
 scrap/albums.py          |  49 ++++++++++++++++++++
 scrap/api.py             |  15 ++++++
 scrap/database.py        | 117 +++++++++++++++++++++++++++++++++++++++++++++++
 scrap/main.py            |  53 ++++++++++++++++-----
 scrap/parser.py          |  83 +++++++++++++++++++++++++++++++++
 scrap/scrap.py           |  25 ----------
 scrap/structures.py      |  12 +++++
 9 files changed, 322 insertions(+), 37 deletions(-)
 create mode 100644 scrap/.gitignore
 create mode 100644 scrap/albums.py
 create mode 100644 scrap/api.py
 create mode 100644 scrap/database.py
 create mode 100644 scrap/parser.py
 delete mode 100644 scrap/scrap.py
 create mode 100644 scrap/structures.py

(limited to 'scrap')

diff --git a/scrap/.gitignore b/scrap/.gitignore
new file mode 100644
index 0000000..6922c3e
--- /dev/null
+++ b/scrap/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+*.json
+covers/
diff --git a/scrap/Lyrics_apathy.json b/scrap/Lyrics_apathy.json
index 0272d47..29c06ee 100644
--- a/scrap/Lyrics_apathy.json
+++ b/scrap/Lyrics_apathy.json
@@ -860,4 +860,4 @@
    }
   }
  ]
-}
+}
\ No newline at end of file
diff --git a/scrap/albums.py b/scrap/albums.py
new file mode 100644
index 0000000..791b962
--- /dev/null
+++ b/scrap/albums.py
@@ -0,0 +1,49 @@
+album_data = [  # list of [artist name, [album titles]] pairs
+    ["astrophysics", [
+        "apathy",
+        "The Unending Need For Perpetual Motion",
+        "HOPE LEFT ME Complete Version",
+        "Selected Tragic",
+        "Cute Tragedies",
+        "selected apathetic",
+        "ENTITY",
+    ]],
+    ["sadsvit", [
+        "Неонова Мрія Neon Dream",
+        "20 21",
+        "Cassette",
+        "Суматоха Bustle",
+    ]],
+    ["molchat doma", [
+        "Белая Полоса White Stripe",
+        "Монумент Monument",
+        "Этажи Floors",
+        "С крыш наших домов From our houses rooftops",
+    ]],
+    ["amesoeurs", [
+        "Amesoeurs",
+    ]],
+    ["severoth", [
+        "Vsesvit",
+        "Winterfall",
+    ]],
+    ["sadness", [
+        "Circle of Veins",
+    ]],
+]
+
+# ["astrophysics", "apathy"],
+# ["astrophysics", "The Unending Need For Perpetual Motion"],
+# ["astrophysics", "HOPE LEFT ME Complete Version"],
+# ["astrophysics", "Selected Tragic"],
+# ["astrophysics", "Cute Tragedies"],
+# ["astrophysics", "selected apathetic"],
+# ["astrophysics", "ENTITY"],
+# ["sadsvit", "Неонова Мрія Neon Dream"],
+# ["sadsvit", "20 21"],
+# ["sadsvit", "Casette"],
+# ["sadsvit", "Суматоха Bustle"],
+# ["molchat doma", "Белая Полоса White Stripe"],
+# ["molchat doma", "Монумент Monument"],
+# ["molchat doma", "Этажи Floors"],
+# ["molchat doma", "С крыш наших домов From our houses’ rooftops"],
diff --git a/scrap/api.py b/scrap/api.py
new file mode 100644
index 0000000..af83bb7
--- /dev/null
+++ b/scrap/api.py
@@ -0,0 +1,39 @@
+"""Download album lyrics from the Genius API as local JSON files."""
+import os
+
+import lyricsgenius
+
+
+# NOTE(review): this access token is committed in plain text and should be
+# rotated.  Set GENIUS_ACCESS_TOKEN to override it without editing the file.
+genius = lyricsgenius.Genius(
+    os.environ.get(
+        "GENIUS_ACCESS_TOKEN",
+        "0uSA9UFGsiO2WozVmbWPhyhOoVmUNuM3PXRt9rvWhptHBMgSO5CZBxGUMkwet5mv",
+    )
+)
+
+
+def download_albums(albums):
+    """Fetch and save the lyrics JSON of every album in *albums*.
+
+    albums: list of ``[artist_name, [album_title, ...]]`` pairs.
+    """
+    for artist_name, album_names in albums:
+        for album_name in album_names:
+            get_album_json(artist_name, album_name)
+
+
+def get_album_json(artist_name, album_name):
+    """Search Genius for one album and save its lyrics to disk.
+
+    Returns whatever ``save_lyrics()`` returns, or ``None`` when the search
+    finds no matching album.
+    """
+    album = genius.search_album(album_name, artist_name)
+    if album is None:
+        # search_album() yields None when nothing matches; skip the album
+        # instead of crashing on ``None.save_lyrics()`` mid-download.
+        print(f"album not found: {artist_name} - {album_name}")
+        return None
+    return album.save_lyrics()
diff --git a/scrap/database.py b/scrap/database.py
new file mode 100644
index 0000000..3934361
--- /dev/null
+++ b/scrap/database.py
@@ -0,0 +1,140 @@
+"""Create the balalaika MySQL schema and upload the scraped data."""
+import mysql.connector
+
+connector = mysql.connector.connect(
+    host="localhost",
+    user="root",
+)
+
+cursor = connector.cursor()
+
+
+def setup():
+    """Recreate the balalaika tables from scratch.
+
+    Any existing song, album and artist tables are dropped first, so data
+    from a previous run is discarded.
+    """
+    cursor.execute("CREATE DATABASE IF NOT EXISTS balalaika;")
+    cursor.execute("USE balalaika;")
+    # Drop children before parents so no foreign key blocks the drop.
+    cursor.execute("DROP TABLE IF EXISTS song;")
+    cursor.execute("DROP TABLE IF EXISTS album;")
+    cursor.execute("DROP TABLE IF EXISTS artist;")
+
+    cursor.execute("""
+        CREATE TABLE artist (
+            id int NOT NULL AUTO_INCREMENT,
+            name varchar(255),
+
+            PRIMARY KEY (id)
+        );
+    """)
+
+    cursor.execute("""
+        CREATE TABLE album (
+            id int NOT NULL AUTO_INCREMENT,
+            name varchar(255),
+            cover varchar(510),
+            artist_id int,
+
+            PRIMARY KEY (id),
+            FOREIGN KEY (artist_id) REFERENCES artist(id)
+        );
+    """)
+
+    cursor.execute("""
+        CREATE TABLE song (
+            id int NOT NULL AUTO_INCREMENT,
+            name varchar(255),
+            lyrics TEXT,
+
+            album_id int,
+
+            PRIMARY KEY (id),
+            FOREIGN KEY (album_id) REFERENCES album(id)
+        );
+    """)
+
+    # NOTE(review): MySQL's "utf8" is the 3-byte utf8mb3 alias; lyrics with
+    # 4-byte characters (emoji) would need utf8mb4 -- confirm and switch.
+    cursor.execute("ALTER TABLE song CONVERT TO CHARACTER SET utf8")
+    cursor.execute("ALTER TABLE album CONVERT TO CHARACTER SET utf8")
+    cursor.execute("ALTER TABLE artist CONVERT TO CHARACTER SET utf8")
+
+
+def process_albums(album_list):
+    """Upload every album in *album_list* together with its songs."""
+    for album_id, album in enumerate(album_list):
+        process_album(album, album_id)
+
+
+def process_album(album, album_id):
+    """Insert *album*, then insert its songs pointing at the new row.
+
+    album_id is the zero-based position of the album in the upload; it is
+    only a fallback (position + 1) for when the cursor reports no row id.
+    """
+    row_id = upload_album(album) or album_id + 1
+    for song in album.songs:
+        upload_song(song, row_id)
+
+
+def upload_album(album):
+    """Insert one album row and return its AUTO_INCREMENT id."""
+    cursor.execute("""
+        INSERT INTO album (
+            name, cover, artist_id
+        )
+        VALUES (
+            %(name)s, %(cover)s, %(artist_id)s
+        );
+    """, {
+        'name': album.name,
+        'cover': album.cover,
+        'artist_id': album.artist
+    })
+    return cursor.lastrowid
+
+
+def upload_song(song, album_id):
+    """Insert one song row that belongs to the album row *album_id*."""
+    cursor.execute("""
+        INSERT INTO song (
+            name, lyrics, album_id
+        )
+        VALUES (
+            %(name)s, %(lyrics)s, %(album_id)s
+        )
+    """, {
+        'name': song.name,
+        'lyrics': song.lyrics,
+        'album_id': album_id
+    })
+
+
+def process_artists(artist_names):
+    """Insert one artist row per name, in the given order."""
+    for artist in artist_names:
+        process_artist(artist)
+
+
+def process_artist(artist):
+    """Insert a single artist row with the name *artist*."""
+    cursor.execute("""
+        INSERT INTO artist (
+            name
+        )
+        VALUES (
+            %(name)s
+        )
+    """, {
+        'name': artist,
+    })
+
+
+def close():
+    """Commit all pending inserts and release the cursor and connection."""
+    connector.commit()
+    cursor.close()
+    connector.close()
diff --git a/scrap/main.py b/scrap/main.py
index 80f4d24..72a9929 100644
--- a/scrap/main.py
+++ b/scrap/main.py
@@ -1,18 +1,54 @@
-import lyricsgenius
-import sys
-
-
-def getAlbum(artist_name, album_name):
-    genius = lyricsgenius.Genius(
-        "0uSA9UFGsiO2WozVmbWPhyhOoVmUNuM3PXRt9rvWhptHBMgSO5CZBxGUMkwet5mv"
-    )
-    album = genius.search_album(artist_name, album_name)
-    album.save_lyrics()
-
-
-def getJsonLyrics(link):
-    for key, value in data.items():
-        print(key, value)
-
-
-getJsonLyrics(sys.argv[1])
+"""Entry point: optionally download the lyrics, then fill the database."""
+from albums import album_data
+import api
+import parser
+import database
+
+
+def start():
+    """Ask whether to download from Genius, then rebuild the database."""
+    print("do you want to download the json data from genius? [y, n] ", end="")
+    if input().lower() == "y":
+        api.download_albums(album_data)
+        print("download finished!")
+
+    database.setup()
+    upload_albums(
+        get_album_data(),
+        get_artist_names()
+    )
+    database.close()
+    print("upload finished!")
+    print("remember to move the covers directory once you're done!")
+
+
+def upload_albums(album_data, artist_names):
+    """Insert the artists first, then the albums that reference them."""
+    database.process_artists(artist_names)
+    database.process_albums(album_data)
+
+
+def get_album_data():
+    """Parse the saved JSON file of every album listed in albums.py.
+
+    Returns a flat list of album structures.  Each one carries its artist's
+    1-based position in album_data as the artist id, which assumes the
+    artist rows get ids 1, 2, ... in that same order.
+    """
+    # Must not be named ``album_data``: re-binding that name made it local,
+    # so the loop iterated the new empty list and nothing was uploaded.
+    albums = []
+    for artist_id, artist in enumerate(album_data, start=1):
+        for album in artist[1]:
+            albums.append(parser.process_json_file(album, artist_id))
+
+    return albums
+
+
+def get_artist_names():
+    """Return the artist names in the order they appear in album_data."""
+    return [artist[0] for artist in album_data]
+
+
+if __name__ == "__main__":
+    start()
diff --git a/scrap/parser.py b/scrap/parser.py
new file mode 100644
index 0000000..618f642
--- /dev/null
+++ b/scrap/parser.py
@@ -0,0 +1,101 @@
+"""Convert the saved Genius JSON into album/song structures."""
+import json
+import os
+import re
+import subprocess
+
+import structures
+
+
+def process_json_file(name, artist_id):
+    """Load the saved JSON of album *name* and convert it to an album."""
+    # The context manager closes the file even if reading fails; the
+    # encoding is pinned so Cyrillic titles do not depend on the locale.
+    with open(getLink(name), "r", encoding="utf-8") as file_json:
+        album_json = file_json.read()
+
+    return process_json(album_json, artist_id)
+
+
+def getLink(name):
+    """Return the JSON file name used for the album titled *name*."""
+    return "Lyrics_"+name.replace(" ", "")+".json"
+
+
+def process_json(album_json, artist_id):
+    """Build an album structure from the JSON text *album_json*.
+
+    Also downloads the album cover (see download_cover) as a side effect.
+    """
+    data = json.loads(album_json)
+    name = data["name"].lower()
+
+    artist_name_api = data["artist"]["name"].lower()
+    artist_name = artist_name_api.replace(" ", "")
+    off_cover = data["cover_art_thumbnail_url"]
+
+    new_cover = get_cover_link(artist_name, name)
+    download_cover(off_cover, new_cover, artist_name)
+
+    songs = [analyze_song(song) for song in data["tracks"]]
+    return structures.album(name, new_cover, songs, artist_id)
+
+
+def get_cover_link(artist_name, album_name):
+    """Return the local path where the cover of the album is stored."""
+    album_name = album_name.replace(" ", "")
+    return f"covers/{artist_name}/{album_name}.png"
+
+
+def download_cover(off_cover, new_cover, artist_name):
+    """Download the image at URL *off_cover* to *new_cover* if missing."""
+    # os.path.isfile() is False for directories, so the old checks re-ran
+    # mkdir on every call; makedirs creates both levels idempotently.
+    os.makedirs(f"covers/{artist_name}", exist_ok=True)
+
+    if not os.path.isfile(new_cover):
+        # Argument list instead of a shell string: names taken from the
+        # API may contain quotes or other shell metacharacters.
+        subprocess.run(["wget", off_cover, "-O", new_cover], check=False)
+
+
+def analyze_song(song):
+    """Convert one track entry of the album JSON into a song structure."""
+    name = song["song"]["title"].lower()
+    lyrics = format_lyrics(song["song"]["lyrics"])
+
+    return structures.song(name, lyrics)
+
+
+def format_lyrics(lyrics):
+    """Strip page boilerplate from *lyrics* and normalise the text.
+
+    Returns the cleaned, lower-cased lyrics, or an empty string when there
+    is nothing to clean.
+    """
+    if not lyrics:
+        return ""
+
+    # Drop everything up to the first "Lyrics" marker (presumably the
+    # "<title> Lyrics" header).  partition() tolerates texts without the
+    # marker, where split(...)[1] raised IndexError.
+    _, marker, body = lyrics.partition("Lyrics")
+    if marker:
+        lyrics = body.lstrip()
+    lyrics = lyrics.split("Embed")[0].rstrip()
+    lyrics = lyrics.replace("You might also like", "")
+    lyrics = re.sub(r'See (.*?) LiveGet', 'liveget', lyrics)
+    lyrics = lyrics.split("liveget")[0].rstrip()
+
+    # Remove bracketed tags such as [Chorus] or (x2), then collapse the
+    # runs of blank lines they leave behind.
+    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
+    lyrics = re.sub(r"\n{3,}", "\n\n", lyrics)
+
+    # NOTE(review): this also turns U+0435 (Cyrillic "e") inside genuinely
+    # Cyrillic lyrics into a Latin "e" -- confirm that is intended.
+    lyrics = lyrics.replace("\u0435", "e")
+    lyrics = lyrics.replace("\u2005", " ")
+
+    # lstrip() is safe on an empty string, unlike indexing lyrics[0].
+    return lyrics.lstrip("\n").lower()
diff --git a/scrap/scrap.py b/scrap/scrap.py
deleted file mode 100644
index d979f49..0000000
--- a/scrap/scrap.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from bs4 import BeautifulSoup
-import re
-# import os
-import sys
-import requests
-
-
-def scrape_song_lyrics(url):
-    page = requests.get(url)
-    html = BeautifulSoup(page.text, 'html.parser')
-    lyrics = str(html.find(
-        'div',
-        # class_='lyrics-root-pin-spacer',
-        class_='Lyrics__Container-sc-1ynbvzw-1 kUgSbL'
-    ))
-    # remove identifiers like chorus, verse, etc
-    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
-    lyrics = re.sub(r'<.*?>', '\n', lyrics)
-    # remove empty lines
-    # lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])
-    lyrics = "\n".join([s for s in lyrics.split("\n") if s])
-    return lyrics
-
-
-print(scrape_song_lyrics(sys.argv[1]))
diff 
--git a/scrap/structures.py b/scrap/structures.py
new file mode 100644
index 0000000..b139489
--- /dev/null
+++ b/scrap/structures.py
@@ -0,0 +1,19 @@
+"""Plain data holders for the songs and albums produced by the scraper."""
+
+
+class song:
+    """A track: its title and its lyrics text."""
+
+    def __init__(self, name, lyrics):
+        self.lyrics = lyrics
+        self.name = name
+
+
+class album:
+    """An album: title, cover path, list of songs and owning artist id."""
+
+    def __init__(self, name, cover, songs, artist_id):
+        self.artist = artist_id  # exposed as ``artist``, not ``artist_id``
+        self.songs = songs
+        self.cover = cover
+        self.name = name
-- 
cgit v1.2.3