Home > Software design >  how to extract a link inside <script> tag
How to extract a link inside a <script> tag

Time:02-07

I'm trying to get the link to an .mp3 file from a TikTok sound page. The problem is that I can't extract it because it's inside a <script> tag.
I'm using pycurl instead of requests.

All I need is to extract the following from the response, and then extract the URL from "UrlList": "playUrl":{"Uri":"musically-maliva-obj/7038595527327419141.mp3","UrlList":["https://sf16-ies-music-va.tiktokcdn.com/obj/musically-maliva-obj/7038595527327419141.mp3"]}

import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup


# Fetch the TikTok page with pycurl and try to locate the <script> element
# that should contain the sound's play URL.
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()  # in-memory sink that receives the response body
c = pycurl.Curl()
c.setopt(pycurl.CAINFO, certifi.where())
c.setopt(c.URL, url)
# NOTE(review): the next two options switch off TLS peer and host-name
# verification, so the CA bundle configured via CAINFO above is not used to
# validate the server.
c.setopt(pycurl.SSL_VERIFYPEER, 0)
c.setopt(pycurl.SSL_VERIFYHOST, 0)
# Send an iPhone user agent with the request.
c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
c.setopt(c.WRITEDATA, buffer)
c.setopt(c.FOLLOWLOCATION, True)  # follow HTTP redirects
c.perform()
c.close()
body = buffer.getvalue()
response = body.decode('utf-8')
# Earlier attempts at cutting the value out with plain string splitting:
#response = response.split('"')
#response = response[1]
#response = response.split('.html?')
#response= response[0]
a = response.split("'")  # yields a list of fragments; `a` is not used again below
soup = BeautifulSoup(response, 'html.parser')  # response is already a decoded str
link = soup.find("script", id="sigi-persisted-data")  # bs4 lookup by element id; this did not give the expected result
print(link)

CodePudding user response:

You can try extracting the JSON data, parsing it into a dictionary, and then navigating the dictionary to get the data (json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])

import pycurl
from io import BytesIO
import certifi
from bs4 import BeautifulSoup
import re
import json


# Download the TikTok page, pull the JSON assigned to window.__INIT_PROPS__
# out of its <script> element and print the first play URL of the sound.
url = "https://vm.tiktok.com/ZML1t1vW7/"
buffer = BytesIO()  # in-memory sink that receives the response body
c = pycurl.Curl()
try:
    # Validate the server against certifi's CA bundle. SSL_VERIFYPEER and
    # SSL_VERIFYHOST are left at curl's secure defaults: switching them off
    # would disable TLS validation and make the CAINFO bundle pointless.
    c.setopt(pycurl.CAINFO, certifi.where())
    c.setopt(c.URL, url)
    c.setopt(pycurl.HTTPHEADER, ["User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"])
    c.setopt(c.WRITEDATA, buffer)
    c.setopt(c.FOLLOWLOCATION, True)  # follow HTTP redirects
    c.perform()
finally:
    # Release the curl handle even when the transfer raises pycurl.error.
    c.close()
body = buffer.getvalue()
response = body.decode('utf-8')
soup = BeautifulSoup(response, 'html.parser')
scripts = soup.find_all("script")

# Compiled once, outside the loop. The dots in "window.__INIT_PROPS__" are
# escaped so they only match literal dots, and DOTALL lets the captured JSON
# span several lines.
init_props_re = re.compile(
    r'<script>window\.__INIT_PROPS__ = (.*)</script>', re.DOTALL
)

for s in scripts:
    res = init_props_re.search(str(s))
    if res:
        json_data = json.loads(res.group(1))
        # Path to the sound's first playable URL inside the page state.
        print(json_data["/music/*-:id"]["musicData"]["playUrl"]["UrlList"][0])

CodePudding user response:

You can use a regular expression pattern:

import re
...

# The non-greedy group stops at the first closing quote; a greedy ".*" would
# run on to the last double quote on the same line.
match = re.search(r'"playUrl":"(.*?)"', str(soup))
if match:
    print(match.group(1))
else:
    # Without this check a page lacking the key fails with an opaque
    # AttributeError on None.
    print("playUrl not found in the page")
  •  Tags:  
  • Related