summaryrefslogtreecommitdiff
path: root/scrap/scrap.py
blob: d979f49ee96ff54e68412448a460b9340824b8a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from bs4 import BeautifulSoup
import re
# import os
import sys
import requests


# Class attribute of the <div> that wraps the lyrics. It is a generated
# styled-components name, so callers can override it if the site changes it.
_DEFAULT_CONTAINER_CLASS = 'Lyrics__Container-sc-1ynbvzw-1 kUgSbL'


def _markup_to_lyrics(markup):
    """Turn the lyrics container's HTML markup into plain text lines."""
    from html import unescape

    # Remove identifiers like chorus, verse, etc. while the text is still
    # markup, so a label that wraps a link is dropped in one piece.
    text = re.sub(r'[\(\[].*?[\)\]]', '', markup)
    # Every tag becomes a line break; [^>]* also covers tags spanning lines.
    text = re.sub(r'<[^>]*>', '\n', text)
    # Serialised markup escapes &, < and >. Decode only after the tags are
    # gone so an escaped "<" in the lyrics is never mistaken for a tag.
    text = unescape(text)
    # Drop empty and whitespace-only lines.
    return "\n".join(line for line in text.split("\n") if line.strip())


def scrape_song_lyrics(url, container_class=_DEFAULT_CONTAINER_CLASS,
                       timeout=10):
    """Download a song page and return its lyrics as plain text.

    Args:
        url: Address of the page to fetch.
        container_class: Exact ``class`` attribute of the ``div`` holding
            the lyrics. Only the first matching ``div`` is used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The lyrics, one line per text fragment, with bracketed section
        labels and blank lines removed. An empty string is returned when
        the page has no matching container.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure, timeout, ...).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    container = soup.find('div', class_=container_class)
    if container is None:
        # str(None) would otherwise be returned as the literal text "None".
        return ''
    return _markup_to_lyrics(str(container))


def main():
    """Print the lyrics of the page whose URL is the first CLI argument."""
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {} URL".format(sys.argv[0]))
    print(scrape_song_lyrics(sys.argv[1]))


# Only scrape when run as a script, so importing this module has no
# network side effects.
if __name__ == "__main__":
    main()