Home > Software design >  Trying to figure out how to use Beautiful Soup to parse multiple sub URLs from one main URL
Trying to figure out how to use Beautiful Soup to parse multiple sub URLs from one main URL

Time:02-04

Here is the main URL. https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data

I can collect strings that have this structure, into a list. /vsoch/hospital-chargemaster/0.0.2/data/baptist-health-system-(san-antonio)

The full file path looks something like this. https://raw.githubusercontent.com/vsoch/hospital-chargemaster/0.0.2/data/baptist-health-system-(san-antonio)/data-latest.tsv

My question is, how can I download a bunch of TSV files to my desktop in one go? I know some TSV files are pretty hard to parse, and I don't want to invest a lot of time getting at things that are hard to reach. I just want to get the code to download some/most TSV files to a folder on my desktop.

# main URL
# https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data

import requests
from bs4 import BeautifulSoup
import urllib

all_links = []
url = "https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data"
# Fetch the directory listing page; response.text is its HTML source.
response = requests.get(url)
data = response.text
# Parse the HTML so that its anchors can be enumerated.
soup = BeautifulSoup(data, 'lxml')
tags = soup.find_all('a')
# Keep the href of every <a> tag. tag.get('href') returns None for an anchor
# without that attribute, and None would break the .replace() call in the
# loop below, so such anchors are skipped here.
for tag in tags:
    href = tag.get('href')
    if href is not None:
        all_links.append(href)

for item in all_links:
    # Drop the 'tree/' segment so the path lines up with the raw-content
    # layout (/vsoch/hospital-chargemaster/0.0.2/data/<folder>).
    item = item.replace('tree/', '')
    print(item)
    try:
        DOWNLOAD_URL = 'https://raw.githubusercontent.com' + item + '/data-latest.tsv'
        print(DOWNLOAD_URL)
        r = requests.get(DOWNLOAD_URL)
        # Shows the Response object for this candidate URL.
        print(r)
    except requests.exceptions.RequestException as e:
        # Not every collected link is a data folder; report the failure and
        # move on to the next one.
        print(e)
    

CodePudding user response:

How about trying this:

import os.path
from shutil import copyfileobj
from typing import Iterator

import requests
from bs4 import BeautifulSoup

# GitHub tree page that serves as the starting point of the crawl.
MAIN_URL = "https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data"

# Headers passed along with the page requests issued by make_soup().
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:96.0) "
        "Gecko/20100101 Firefox/96.0"
    ),
}


def make_soup(url_to_visit: str) -> list:
    """Fetch a page and return its ``.Link--primary`` elements.

    The request is sent through the module-level session ``s`` with the
    module-level ``headers``; the response text is parsed with the "lxml"
    parser.

    Args:
        url_to_visit: Address of the page to fetch.

    Returns:
        The elements matched by the CSS selector ``.Link--primary``.
    """
    page = s.get(url_to_visit, headers=headers)
    parsed = BeautifulSoup(page.text, "lxml")
    return parsed.select(".Link--primary")


def get_main_links() -> Iterator[str]:
    """Yield the absolute URL of each matching link found on MAIN_URL.

    Only links whose href contains "0.0.2" are kept; each one is prefixed
    with "https://github.com" to make it absolute.

    Yields:
        One absolute github.com URL per matching link, in page order.
    """
    for link in make_soup(MAIN_URL):
        # .get() rather than ["href"]: a matched element that carries no href
        # is skipped instead of raising KeyError.
        href = link.get("href", "")
        if "0.0.2" in href:
            yield f"https://github.com{href}"


def get_tsv_links(data_link: str) -> Iterator[list]:
    """Yield a ``[tsv_url, data_link]`` pair for each ``.tsv`` link on a page.

    Args:
        data_link: URL of the folder page to scan.

    Yields:
        Two-item lists: the download URL of one ``.tsv`` file, followed by
        ``data_link`` unchanged so the caller knows which folder it came from.
    """
    for entry in make_soup(data_link):
        # .get() rather than ["href"]: an element without an href is skipped
        # instead of raising KeyError.
        href = entry.get("href", "")
        if not href.endswith(".tsv"):
            continue
        if "/blob/" in href:
            # A ".../blob/<ref>/<path>" link is GitHub's HTML file viewer, not
            # the file itself. The raw host serves the bytes at the same path
            # without the "blob" segment.
            raw_path = href.replace("/blob/", "/", 1)
            tsv_url = f"https://raw.githubusercontent.com{raw_path}"
        else:
            # Unknown link shape: keep the URL exactly as it was built before.
            tsv_url = f"https://github.com{href}"
        yield [tsv_url, data_link]


def download_tsv_files(tsv_links: list):
    """Download one TSV file into ``<download_dir>/<folder name>/``.

    Uses the module-level session ``s`` and the module-level ``download_dir``.

    Args:
        tsv_links: A ``[tsv_url, link]`` pair. The last path segment of
            ``tsv_url`` becomes the file name and the last path segment of
            ``link`` becomes the sub-directory name.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never written to disk as a data file.
    """
    tsv_url, link = tsv_links
    file_name = tsv_url.rsplit("/", 1)[-1]
    data_dir = link.rsplit("/", 1)[-1]

    print(f"Getting {file_name} from {data_dir}...")
    target_dir = os.path.join(download_dir, data_dir)
    # exist_ok=True already covers the "directory is there" case.
    os.makedirs(target_dir, exist_ok=True)

    # The context manager releases the streamed connection when done.
    with s.get(tsv_url, stream=True) as file_object:
        file_object.raise_for_status()
        with open(os.path.join(target_dir, file_name), "wb") as output:
            # iter_content() undoes gzip/deflate transfer encoding; copying
            # from .raw would write the still-compressed bytes.
            for chunk in file_object.iter_content(chunk_size=64 * 1024):
                output.write(chunk)


if __name__ == "__main__":
    # Root folder for everything that gets downloaded; download_tsv_files()
    # reads this as a global.
    download_dir = "vsoch"
    os.makedirs(download_dir, exist_ok=True)

    # One shared session, read as the global ``s`` by the helper functions
    # and closed once the crawl ends, whether or not it succeeded.
    s = requests.Session()
    try:
        # Lazily walk every folder link, then every TSV link inside it.
        pending = (
            tsv_data
            for main_link in get_main_links()
            for tsv_data in get_tsv_links(main_link)
        )
        for tsv_data in pending:
            download_tsv_files(tsv_data)
    finally:
        s.close()

Output:

Getting data-2019.tsv from advent-health...
Getting data-latest.tsv from advent-health...
Getting data-2019.tsv from atlanticare-regional-medical-center...
Getting data-latest.tsv from atlanticare-regional-medical-center...
Getting data-2019.tsv from aurora-health-care-metro-inc....
Getting data-latest.tsv from aurora-health-care-metro-inc....
Getting data-2019.tsv from baptist-health-system-(san-antonio)...
Getting data-latest.tsv from baptist-health-system-(san-antonio)...
Getting data-2019.tsv from baptist-hospital-(miami)...
Getting data-latest.tsv from baptist-hospital-(miami)...

and so on...

Finally, you should have a directory called vsoch with the following structure:

vsoch
├── advent-health
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── atlanticare-regional-medical-center
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── aurora-health-care-metro-inc.
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── baptist-health-system-(san-antonio)
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── baptist-hospital-(miami)
...
  •  Tags:  
  • Related