summary | refs | log | tree | commit | diff
path: root/scrap/scrap.py
diff options
context:
space:
mode:
author nil0j <nil.jimeno@estudiant.fjaverianas.com> 2024-09-21 10:17:23 +0200
committer nil0j <nil.jimeno@estudiant.fjaverianas.com> 2024-09-21 10:17:23 +0200
commit faa13839f898c60ff5618be6e916ad2e60958468 (patch)
tree d785b9e7c66d092c0e5fbc296382289f39fea501 /scrap/scrap.py
init
Diffstat (limited to 'scrap/scrap.py')
-rw-r--r-- scrap/scrap.py 25
1 files changed, 25 insertions, 0 deletions
diff --git a/scrap/scrap.py b/scrap/scrap.py
new file mode 100644
index 0000000..d979f49
--- /dev/null
+++ b/scrap/scrap.py
@@ -0,0 +1,25 @@
+from bs4 import BeautifulSoup
+import re
+# import os
+import sys
+import requests
+
+
+def scrape_song_lyrics(url):
+    """Download the page at `url` and return its lyrics as plain text.
+
+    Only the first matching lyrics <div> is read; returns '' when the page
+    has none. Raises requests.HTTPError if the server reports an error.
+    """
+    page = requests.get(url, timeout=30)  # no timeout could hang forever
+    page.raise_for_status()
+    html = BeautifulSoup(page.text, 'html.parser')
+    block = html.find('div', class_='Lyrics__Container-sc-1ynbvzw-1 kUgSbL')
+    if block is None:
+        return ''  # str(None) would otherwise yield the text 'None'
+    lyrics = re.sub(r'[\(\[].*?[\)\]]', '', str(block))  # drop [Chorus] etc
+    lyrics = re.sub(r'<.*?>', '\n', lyrics)  # every tag becomes a line break
+    return "\n".join(s for s in lyrics.split("\n") if s)  # skip empty lines
+
+
+print(scrape_song_lyrics(sys.argv[1]))  # also runs on import; argv[1] = URL