summaryrefslogtreecommitdiff
path: root/scrap/parser.py
blob: c447a0d7dae3575a3cf3347a5b6b7abffab75c0a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import json
import re
import structures
import os


def process_json_file(name, album_id, artist_id):
    """Load the lyrics JSON file for album *name* and build its album record.

    The file name is derived from *name* by ``getLink`` and is resolved
    relative to the current working directory.

    Args:
        name: Album name used to derive the JSON file name.
        album_id: Identifier forwarded unchanged to ``process_json``.
        artist_id: Identifier forwarded unchanged to ``process_json``.

    Returns:
        Whatever ``process_json`` returns for the file's contents.

    Raises:
        OSError: If the JSON file cannot be opened or read.
    """
    link = getLink(name)

    # The context manager closes the handle even if read() fails. JSON text
    # is UTF-8 by specification, so do not depend on the locale's default.
    with open(link, "r", encoding="utf-8") as file_json:
        album_json = file_json.read()

    return process_json(album_json, album_id, artist_id)


def getLink(name):
    """Return the lyrics JSON file name for *name*.

    Every space is removed from *name*; the result is prefixed with
    ``Lyrics_`` and suffixed with ``.json``.
    """
    compact_name = name.replace(" ", "")
    return f"Lyrics_{compact_name}.json"


def process_json(album_json, album_id, artist_id):
    """Parse an album JSON document and build the album record.

    Reads the ``name``, ``cover_art_thumbnail_url`` and ``tracks`` keys of
    the decoded document, makes sure the cover is available locally via
    ``download_cover``, and converts each track with ``analyze_song``.

    Args:
        album_json: JSON text describing the album.
        album_id: Identifier used to build the local cover path.
        artist_id: Identifier used for the cover path and passed on to
            ``structures.album``.

    Returns:
        The object produced by ``structures.album`` from the lower-cased
        album name, the local cover path, the converted tracks and
        *artist_id*.
    """
    parsed = json.loads(album_json)
    title = parsed["name"].lower()
    remote_cover = parsed["cover_art_thumbnail_url"]

    # Compute the local path first, then fetch the image into it.
    local_cover = get_cover_link(artist_id, album_id)
    download_cover(remote_cover, local_cover, artist_id)

    tracks = []
    for track in parsed["tracks"]:
        tracks.append(analyze_song(track))

    return structures.album(title, local_cover, tracks, artist_id)


def get_cover_link(artist_id, album_id):
    """Return the relative path ``covers/<artist_id>/<album_id>.png``."""
    return f"covers/{artist_id}/{album_id}.png"


def download_cover(off_cover, new_cover, artist_id):
    """Download the cover at *off_cover* to *new_cover* unless it already exists.

    Ensures the ``covers/<artist_id>`` directory exists (relative to the
    current working directory) and then, if *new_cover* is not already a
    file, invokes ``wget`` to fetch *off_cover* into it.

    Args:
        off_cover: URL of the remote cover image.
        new_cover: Local path the image is written to.
        artist_id: Identifier naming the per-artist directory under ``covers``.

    Returns:
        None. A failed download is not reported to the caller; wget's exit
        status is ignored, as it was when the command ran through a shell.
    """
    # Function-scope imports keep this block self-contained.
    import subprocess
    import sys

    # Creates "covers" and the artist directory in one call, with no shell
    # involved, so quotes or spaces in artist_id cannot break the command.
    os.makedirs(f"covers/{artist_id}", exist_ok=True)

    if not os.path.isfile(new_cover):
        try:
            # An argv list means the URL is never parsed by a shell: "&",
            # ";" or spaces in it cannot split or inject commands. The "--"
            # stops wget from treating a URL that starts with "-" as an option.
            subprocess.run(
                ["wget", "-O", new_cover, "--", off_cover],
                check=False,
            )
        except FileNotFoundError:
            # A missing wget used to be a shell error message only; keep it
            # non-fatal so callers still get their album record.
            print("wget not found; cover was not downloaded", file=sys.stderr)


def analyze_song(song):
    """Build a song record from one track entry.

    Reads ``song["song"]["title"]`` and ``song["song"]["lyrics"]``,
    lower-cases the title, cleans the lyrics with ``format_lyrics`` and
    returns the result of ``structures.song(title, lyrics)``.
    """
    details = song["song"]
    title = details["title"].lower()
    cleaned_lyrics = format_lyrics(details["lyrics"])
    return structures.song(title, cleaned_lyrics)


def format_lyrics(lyrics):
    """Strip scraper boilerplate from raw lyrics text and lower-case it.

    Steps, in order:
      1. Drop everything up to and including the first ``Lyrics`` marker
         (if present) and any whitespace after it.
      2. Drop everything from the first ``Embed`` marker onwards.
      3. Remove every ``You might also like`` insert.
      4. Cut the text at the first ``See ... LiveGet`` insert.
      5. Remove ``(...)`` and ``[...]`` spans that sit on a single line.
      6. Collapse runs of three or more newlines to exactly two.
      7. Replace U+2005 (four-per-em space) with a regular space.
      8. Strip leading newlines and lower-case the result.

    Args:
        lyrics: Raw lyrics text. Empty or ``None`` values are returned
            unchanged.

    Returns:
        The cleaned, lower-cased lyrics; possibly an empty string when
        nothing but boilerplate and bracketed annotations was present.
    """
    if not lyrics:
        return lyrics

    # partition() splits on the FIRST marker only, so a later "Lyrics" inside
    # the song text no longer truncates it, and a missing marker keeps the
    # text instead of raising IndexError.
    _, marker, after_header = lyrics.partition("Lyrics")
    if marker:
        lyrics = after_header
    lyrics = lyrics.lstrip()

    lyrics = lyrics.split("Embed")[0].rstrip()
    lyrics = lyrics.replace("You might also like", "")

    # Cut at the insert directly rather than via an in-band "liveget"
    # sentinel, which would also match lyrics that contain that word.
    lyrics = re.split(r"See .*? LiveGet", lyrics, maxsplit=1)[0].rstrip()

    lyrics = re.sub(r"[\(\[].*?[\)\]]", "", lyrics)
    lyrics = re.sub(r"\n{3,}", "\n\n", lyrics)

    lyrics = lyrics.replace("\u2005", " ")

    # Removing bracketed headers can leave leading newlines, or nothing at
    # all; lstrip handles the empty case that indexing lyrics[0] could not.
    return lyrics.lstrip("\n").lower()