summaryrefslogtreecommitdiff
path: root/scrap
diff options
context:
space:
mode:
author niliara-edu <nil.jimeno@estudiant.fjaverianas.com> 2024-09-24 12:21:40 +0200
committer niliara-edu <nil.jimeno@estudiant.fjaverianas.com> 2024-09-24 12:21:40 +0200
commit 2e8ddbf422a3a6d75b1e1f73cad089111f801c05 (patch)
tree a3b2c7ea8560fcbc89fb9b3ee15933d434b1ff51 /scrap
parent faa13839f898c60ff5618be6e916ad2e60958468 (diff)
finished web scrapper
Diffstat (limited to 'scrap')
-rw-r--r-- scrap/.gitignore 3
-rw-r--r-- scrap/Lyrics_apathy.json 2
-rw-r--r-- scrap/albums.py 49
-rw-r--r-- scrap/api.py 15
-rw-r--r-- scrap/database.py 117
-rw-r--r-- scrap/main.py 53
-rw-r--r-- scrap/parser.py 83
-rw-r--r-- scrap/scrap.py 25
-rw-r--r-- scrap/structures.py 12
9 files changed, 322 insertions, 37 deletions
diff --git a/scrap/.gitignore b/scrap/.gitignore
new file mode 100644
index 0000000..6922c3e
--- /dev/null
+++ b/scrap/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+*.json
+covers/
diff --git a/scrap/Lyrics_apathy.json b/scrap/Lyrics_apathy.json
index 0272d47..29c06ee 100644
--- a/scrap/Lyrics_apathy.json
+++ b/scrap/Lyrics_apathy.json
@@ -860,4 +860,4 @@
}
}
]
-}
+}
\ No newline at end of file
diff --git a/scrap/albums.py b/scrap/albums.py
new file mode 100644
index 0000000..791b962
--- /dev/null
+++ b/scrap/albums.py
@@ -0,0 +1,49 @@
+# Albums to scrape, grouped by artist. Every entry has the shape
+# [artist_name, [album_title, ...]]: consumers read index 0 for the artist
+# name and index 1 for that artist's list of album titles.
+album_data = [
+ ["astrophysics", [
+ "apathy",
+ "The Unending Need For Perpetual Motion",
+ "HOPE LEFT ME Complete Version",
+ "Selected Tragic",
+ "Cute Tragedies",
+ "selected apathetic",
+ "ENTITY",
+ ]],
+ ["sadsvit", [
+ "Неонова Мрія Neon Dream",
+ "20 21",
+ "Cassette",
+ "Суматоха Bustle",
+ ]],
+ ["molchat doma", [
+ "Белая Полоса White Stripe",
+ "Монумент Monument",
+ "Этажи Floors",
+ "С крыш наших домов From our houses rooftops",
+ ]],
+ ["amesoeurs", [
+ "Amesoeurs",
+ ]],
+ ["severoth", [
+ "Vsesvit",
+ "Winterfall",
+ ]],
+ ["sadness", [
+ "Circle of Veins",
+ ]],
+]
+
+# ["astrophysics", "apathy"],
+# ["astrophysics", "The Unending Need For Perpetual Motion"],
+# ["astrophysics", "HOPE LEFT ME Complete Version"],
+# ["astrophysics", "Selected Tragic"],
+# ["astrophysics", "Cute Tragedies"],
+# ["astrophysics", "selected apathetic"],
+# ["astrophysics", "ENTITY"],
+# ["sadsvit", "Неонова Мрія Neon Dream"],
+# ["sadsvit", "20 21"],
+# ["sadsvit", "Cassette"],
+# ["sadsvit", "Суматоха Bustle"],
+# ["molchat doma", "Белая Полоса White Stripe"],
+# ["molchat doma", "Монумент Monument"],
+# ["molchat doma", "Этажи Floors"],
+# ["molchat doma", "С крыш наших домов From our houses’ rooftops"],
diff --git a/scrap/api.py b/scrap/api.py
new file mode 100644
index 0000000..af83bb7
--- /dev/null
+++ b/scrap/api.py
@@ -0,0 +1,15 @@
+import os
+
+import lyricsgenius
+
+
+# Shared Genius API client used by every function in this module.
+#
+# SECURITY: the access token is read from the GENIUS_ACCESS_TOKEN environment
+# variable when it is set. The literal below is kept only as a
+# backward-compatible fallback; a token committed to a repository should be
+# treated as leaked, so rotate it and drop the fallback.
+genius = lyricsgenius.Genius(
+    os.environ.get(
+        "GENIUS_ACCESS_TOKEN",
+        "0uSA9UFGsiO2WozVmbWPhyhOoVmUNuM3PXRt9rvWhptHBMgSO5CZBxGUMkwet5mv",
+    )
+)
+
+
+def download_albums(albums):
+    """Download and save the lyrics of every album listed in *albums*.
+
+    albums is a list of ``[artist_name, [album_title, ...]]`` entries; each
+    title is fetched through get_album_json().
+    """
+    # Plain loops: the calls are made for their side effects only, so there is
+    # no point in building (and discarding) a list of their results.
+    for artist in albums:
+        for album in artist[1]:
+            get_album_json(artist[0], album)
+
+
+def get_album_json(artist_name, album_name):
+    """Look up one album on Genius and save its lyrics to disk.
+
+    Returns whatever ``album.save_lyrics()`` returns.
+
+    Raises:
+        LookupError: if the search yields no album for the given names.
+    """
+    album = genius.search_album(album_name, artist_name)
+    if album is None:
+        # Without this guard a failed search surfaces as an opaque
+        # "'NoneType' object has no attribute 'save_lyrics'".
+        raise LookupError(
+            f"album {album_name!r} by {artist_name!r} was not found on Genius"
+        )
+    return album.save_lyrics()
diff --git a/scrap/database.py b/scrap/database.py
new file mode 100644
index 0000000..3934361
--- /dev/null
+++ b/scrap/database.py
@@ -0,0 +1,117 @@
+import mysql.connector
+
+# Module-level connection shared by every function in this file. It is opened
+# as soon as the module is imported, against localhost as user "root", with
+# no password argument and no database selected here.
+connector = mysql.connector.connect(
+ host="localhost",
+ user="root",
+)
+
+# Single cursor reused for all statements issued by this module.
+cursor = connector.cursor()
+
+
+def setup():
+    """(Re)create the ``balalaika`` schema from scratch.
+
+    Creates the database if it is missing and selects it, drops the song,
+    album and artist tables (in that order), recreates artist, album and song,
+    and finally converts all three tables to the utf8 character set.
+    """
+    statements = (
+        "CREATE DATABASE IF NOT EXISTS balalaika;",
+        "USE balalaika;",
+        "DROP TABLE IF EXISTS song;",
+        "DROP TABLE IF EXISTS album;",
+        "DROP TABLE IF EXISTS artist;",
+        """
+        CREATE TABLE artist (
+            id int NOT NULL AUTO_INCREMENT,
+            name varchar(255),
+
+            PRIMARY KEY (id)
+        );
+        """,
+        """
+        CREATE TABLE album (
+            id int NOT NULL AUTO_INCREMENT,
+            name varchar(255),
+            cover varchar(510),
+            artist_id int,
+
+            PRIMARY KEY (id),
+            FOREIGN KEY (artist_id) REFERENCES artist(id)
+        );
+        """,
+        """
+        CREATE TABLE song (
+            id int NOT NULL AUTO_INCREMENT,
+            name varchar(255),
+            lyrics TEXT,
+
+            album_id int,
+
+            PRIMARY KEY (id),
+            FOREIGN KEY (album_id) REFERENCES album(id)
+        );
+        """,
+        "ALTER TABLE song CONVERT TO CHARACTER SET utf8",
+        "ALTER TABLE album CONVERT TO CHARACTER SET utf8",
+        "ALTER TABLE artist CONVERT TO CHARACTER SET utf8",
+    )
+
+    # Statements run strictly in the order listed above.
+    for statement in statements:
+        cursor.execute(statement)
+
+
+def process_albums(album_list):
+    """Upload every album in *album_list* together with its songs.
+
+    Each album is handed to process_album() along with its zero-based
+    position in the list.
+    """
+    # A loop instead of a throw-away list comprehension: only the side
+    # effects of process_album() matter here.
+    for album_id, album in enumerate(album_list):
+        process_album(album, album_id)
+
+
+def process_album(album, album_id):
+    """Insert *album* and then every song in ``album.songs``.
+
+    album_id is the zero-based position of the album in the upload order. It
+    is only used as a fallback (``album_id + 1``) when the cursor does not
+    report an id for the row that was just inserted.
+    """
+    upload_album(album)
+
+    # Link the songs to the id the database actually generated rather than
+    # assuming AUTO_INCREMENT hands out 1, 2, 3, ... in upload order.
+    inserted_id = cursor.lastrowid or album_id + 1
+
+    for song in album.songs:
+        upload_song(song, inserted_id)
+
+
+def upload_album(album):
+    """Insert one row into the ``album`` table.
+
+    The row takes its name, cover and artist_id from ``album.name``,
+    ``album.cover`` and ``album.artist`` respectively.
+    """
+    row = {
+        'name': album.name,
+        'cover': album.cover,
+        'artist_id': album.artist,
+    }
+    insert_album = """
+        INSERT INTO album (
+            name, cover, artist_id
+        )
+        VALUES (
+            %(name)s, %(cover)s, %(artist_id)s
+        );
+    """
+    cursor.execute(insert_album, row)
+
+
+def upload_song(song, album_id):
+    """Insert one row into the ``song`` table, linked to *album_id*.
+
+    The name and lyrics columns come from ``song.name`` and ``song.lyrics``.
+    """
+    row = {
+        'name': song.name,
+        'lyrics': song.lyrics,
+        'album_id': album_id,
+    }
+    insert_song = """
+        INSERT INTO song (
+            name, lyrics, album_id
+        )
+        VALUES (
+            %(name)s, %(lyrics)s, %(album_id)s
+        )
+    """
+    cursor.execute(insert_song, row)
+
+
+def process_artists(artist_names):
+    """Insert one ``artist`` row per name in *artist_names*, in order."""
+    # A loop instead of a throw-away list comprehension: process_artist() is
+    # called for its side effect only.
+    for artist in artist_names:
+        process_artist(artist)
+
+
+def process_artist(artist):
+    """Insert one row into the ``artist`` table whose name is *artist*."""
+    insert_artist = """
+        INSERT INTO artist (
+            name
+        )
+        VALUES (
+            %(name)s
+        )
+    """
+    cursor.execute(insert_artist, {'name': artist})
+
+
+def close():
+ """Close the shared cursor, commit the connection, then close it."""
+ cursor.close()
+ # Nothing else in this module commits, so this is where the inserts are
+ # made permanent.
+ connector.commit()
+ connector.close()
diff --git a/scrap/main.py b/scrap/main.py
index 80f4d24..72a9929 100644
--- a/scrap/main.py
+++ b/scrap/main.py
@@ -1,18 +1,49 @@
-import lyricsgenius
-import sys
+from albums import album_data
+import api
+import parser
+import database
-def getAlbum(artist_name, album_name):
- genius = lyricsgenius.Genius(
- "0uSA9UFGsiO2WozVmbWPhyhOoVmUNuM3PXRt9rvWhptHBMgSO5CZBxGUMkwet5mv"
+def start():
+ """Run the whole pipeline interactively.
+
+ Asks whether the album JSON should be downloaded from Genius, then
+ rebuilds the database, uploads the artists and albums, closes the
+ database connection and prints a reminder about the covers directory.
+ """
+ print("do you want to download the json data from genius? [y, n] ", end="")
+ if input().lower() == "y":
+ api.download_albums(album_data)
+ print("download finished!")
+
+ database.setup()
+ upload_albums(
+ get_album_data(),
+ get_artist_names()
 )
- album = genius.search_album(artist_name, album_name)
- album.save_lyrics()
+ database.close()
+ print("upload finished!")
+ print("remember to move the covers directory once you're done!")
+
+
+def upload_albums(album_data, artist_names):
+ """Upload the artists first, then the albums.
+
+ Note that the album_data parameter shadows the imported module-level
+ album_data inside this function.
+ """
+ database.process_artists(artist_names)
+ database.process_albums(album_data)
+
+
+def get_album_data():
+    """Parse the saved JSON of every album listed in ``album_data``.
+
+    Returns a flat list with one parsed album per configured title, in
+    configuration order. Each album is parsed with the one-based position of
+    its artist in ``album_data`` as its artist id.
+    """
+    # Use a distinct local name: assigning to ``album_data`` here would make
+    # it local to the function and hide the imported list, so the loop would
+    # run over an empty list and nothing would ever be parsed.
+    parsed_albums = []
+
+    for artist_id, artist in enumerate(album_data, start=1):
+        for album in artist[1]:
+            parsed_albums.append(parser.process_json_file(album, artist_id))
+
+    return parsed_albums
-def getJsonLyrics(link):
- for key, value in data.items():
- print(key, value)
+def get_artist_names():
+    """Return the artist names from ``album_data``, in configuration order."""
+    # Build the list directly instead of using a comprehension as a vehicle
+    # for ``append`` side effects.
+    return [artist[0] for artist in album_data]
-getJsonLyrics(sys.argv[1])
+# Entry point: there is no __main__ guard, so the pipeline starts whenever
+# this module is executed or imported.
+start()
diff --git a/scrap/parser.py b/scrap/parser.py
new file mode 100644
index 0000000..618f642
--- /dev/null
+++ b/scrap/parser.py
@@ -0,0 +1,83 @@
+import json
+import re
+import structures
+import os
+
+
+def process_json_file(name, artist_id):
+    """Read the saved lyrics JSON for album *name* and parse it.
+
+    The file name is derived from *name* with getLink(); the file content and
+    *artist_id* are passed to process_json(), whose result is returned.
+
+    Raises:
+        OSError: if the JSON file cannot be opened or read.
+    """
+    link = getLink(name)
+
+    # The context manager closes the file even when reading fails. The
+    # encoding is explicit so non-Latin text does not depend on the
+    # platform's default.
+    with open(link, "r", encoding="utf-8") as file_json:
+        album_json = file_json.read()
+
+    return process_json(album_json, artist_id)
+
+
+def getLink(name):
+    """Return ``Lyrics_<name>.json`` with every space removed from *name*."""
+    compact_name = "".join(name.split(" "))
+    return f"Lyrics_{compact_name}.json"
+
+
+def process_json(album_json, artist_id):
+    """Turn the JSON text of one album into a ``structures.album``.
+
+    The album name and artist name are lower-cased (the artist name also
+    loses its spaces), the thumbnail cover is downloaded to its local path,
+    and every entry of ``tracks`` is converted with analyze_song().
+    """
+    parsed = json.loads(album_json)
+
+    album_name = parsed["name"].lower()
+    artist_slug = parsed["artist"]["name"].lower().replace(" ", "")
+    remote_cover = parsed["cover_art_thumbnail_url"]
+
+    local_cover = get_cover_link(artist_slug, album_name)
+    download_cover(remote_cover, local_cover, artist_slug)
+
+    tracks = []
+    for track in parsed["tracks"]:
+        tracks.append(analyze_song(track))
+
+    return structures.album(album_name, local_cover, tracks, artist_id)
+
+
+def get_cover_link(artist_name, album_name):
+    """Return the local cover path ``covers/<artist>/<album>.png``.
+
+    Spaces are removed from *album_name*; *artist_name* is used as given.
+    """
+    return f"covers/{artist_name}/{album_name.replace(' ', '')}.png"
+
+
+def download_cover(off_cover, new_cover, artist_name):
+    """Download the cover at URL *off_cover* to *new_cover* unless it exists.
+
+    Makes sure the ``covers/<artist_name>`` directory exists first. The
+    download itself is still delegated to the external ``wget`` program.
+
+    Raises:
+        FileNotFoundError: if the ``wget`` executable is not installed.
+    """
+    import subprocess
+
+    # isfile() is always False for a directory, so the old checks re-ran
+    # ``mkdir`` on every call; makedirs(exist_ok=True) is idempotent and
+    # creates both levels at once.
+    os.makedirs(os.path.join("covers", artist_name), exist_ok=True)
+
+    if not os.path.isfile(new_cover):
+        # Argument list, no shell: URLs and names containing quotes, spaces
+        # or ``&`` can no longer break (or inject into) the command line.
+        subprocess.run(["wget", off_cover, "-O", new_cover], check=False)
+
+
+def analyze_song(song):
+    """Build a ``structures.song`` from one track entry.
+
+    Reads ``song["song"]["title"]`` (lower-cased) and
+    ``song["song"]["lyrics"]`` (cleaned with format_lyrics()).
+    """
+    details = song["song"]
+    title = details["title"].lower()
+    text = format_lyrics(details["lyrics"])
+    return structures.song(title, text)
+
+
+def format_lyrics(lyrics):
+    """Strip page boilerplate from *lyrics* and normalise the text.
+
+    Drops everything up to the first "Lyrics" marker (when present) and
+    everything from the first "Embed" marker on, removes the
+    "You might also like" and "See ... LiveGet" inserts and any (...) or
+    [...] annotation, collapses three or more newlines into one blank line,
+    strips leading newlines and lower-cases the result.
+
+    Returns "" for empty input or when nothing but boilerplate remains.
+    """
+    if not lyrics:
+        return ""
+
+    # Keep everything after the FIRST marker. Indexing ``split(...)[1]``
+    # raised IndexError when the header was missing and silently cut the
+    # text at a second occurrence of the word "Lyrics".
+    _, marker, body = lyrics.partition("Lyrics")
+    if marker:
+        lyrics = body.lstrip()
+
+    lyrics = lyrics.split("Embed")[0].rstrip()
+    lyrics = lyrics.replace("You might also like", "")
+    lyrics = re.sub(r'See (.*?) LiveGet', 'liveget', lyrics)
+    lyrics = lyrics.split("liveget")[0].rstrip()
+
+    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
+    lyrics = re.sub(r"\n{3,}", "\n\n", lyrics)
+
+    # NOTE(review): U+0435 is the Cyrillic letter "е"; replacing it with a
+    # Latin "e" also rewrites genuinely Cyrillic lyrics - confirm intended.
+    lyrics = lyrics.replace("\u0435", "e")
+    lyrics = lyrics.replace("\u2005", " ")
+
+    # lstrip() copes with text that is empty by now, where ``lyrics[0]``
+    # raised IndexError.
+    return lyrics.lstrip("\n").lower()
diff --git a/scrap/scrap.py b/scrap/scrap.py
deleted file mode 100644
index d979f49..0000000
--- a/scrap/scrap.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from bs4 import BeautifulSoup
-import re
-# import os
-import sys
-import requests
-
-
-def scrape_song_lyrics(url):
- page = requests.get(url)
- html = BeautifulSoup(page.text, 'html.parser')
- lyrics = str(html.find(
- 'div',
- # class_='lyrics-root-pin-spacer',
- class_='Lyrics__Container-sc-1ynbvzw-1 kUgSbL'
- ))
- # remove identifiers like chorus, verse, etc
- lyrics = re.sub(r'[\(\[].*?[\)\]]', '', lyrics)
- lyrics = re.sub(r'<.*?>', '\n', lyrics)
- # remove empty lines
- # lyrics = os.linesep.join([s for s in lyrics.splitlines() if s])
- lyrics = "\n".join([s for s in lyrics.split("\n") if s])
- return lyrics
-
-
-print(scrape_song_lyrics(sys.argv[1]))
diff --git a/scrap/structures.py b/scrap/structures.py
new file mode 100644
index 0000000..b139489
--- /dev/null
+++ b/scrap/structures.py
@@ -0,0 +1,12 @@
+class song:
+    """Plain record holding a track's name and its lyrics."""
+
+    def __init__(self, name, lyrics):
+        self.name, self.lyrics = name, lyrics
+
+
+class album:
+    """Plain record holding an album's name, cover, songs and artist id."""
+
+    def __init__(self, name, cover, songs, artist_id):
+        self.name, self.cover = name, cover
+        # The artist id is exposed as ``artist``, not ``artist_id``.
+        self.songs, self.artist = songs, artist_id