Trying to figure out how to use Beautiful Soup to parse multiple sub URLs from one main URL-CodePudding

Here is the main URL. https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data

I can collect strings with this structure into a list: /vsoch/hospital-chargemaster/0.0.2/data/baptist-health-system-(san-antonio)

The full file path looks something like this. https://raw.githubusercontent.com/vsoch/hospital-chargemaster/0.0.2/data/baptist-health-system-(san-antonio)/data-latest.tsv

My question is, how can I download a bunch of TSV files to my desktop in one go? I know some TSV files are pretty hard to parse, and I don't want to invest a lot of time getting at things that are hard to reach. I just want to get the code to download some/most TSV files to a folder on my desktop.

# main URL
# https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data

import requests
from bs4 import BeautifulSoup
import urllib

all_links = []
url = "https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data"
# Fetch the directory listing page. The timeout keeps the script from
# hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# The HTML source of the page.
data = response.text
# Parse the HTML so the anchors can be queried.
soup = BeautifulSoup(data, 'lxml')
# Keep only <a> tags that actually carry an href: tag.get('href') returns
# None for anchors without one, and a None entry breaks the string handling
# applied to every collected link afterwards.
tags = soup.find_all('a', href=True)
# Collect the link targets.
all_links.extend(tag['href'] for tag in tags)

# A directory link on the listing page looks like
#   /vsoch/hospital-chargemaster/tree/0.0.2/data/<hospital>
# and the matching raw file lives at
#   https://raw.githubusercontent.com/vsoch/hospital-chargemaster/0.0.2/data/<hospital>/data-latest.tsv
# so dropping "tree/" and prefixing the raw host yields the download URL.
for item in all_links:
    # Skip missing hrefs and absolute/fragment links: only site-relative
    # paths can be mapped onto the raw.githubusercontent.com host.
    if not item or not item.startswith('/'):
        continue
    item = item.replace('tree/', '')
    print(item)
    try:
        DOWNLOAD_URL = 'https://raw.githubusercontent.com' + item + '/data-latest.tsv'
        print(DOWNLOAD_URL)
        r = requests.get(DOWNLOAD_URL, timeout=30)
        print(r)
        # Links that are not data directories do not resolve to a TSV file;
        # do not save their error responses.
        if not r.ok:
            continue
        # Name the local file after the last path component (the hospital
        # directory) so downloads do not overwrite each other. The file is
        # written to the current working directory.
        file_name = item.rstrip('/').rsplit('/', 1)[-1] + '-data-latest.tsv'
        with open(file_name, 'wb') as out_file:
            out_file.write(r.content)
    except (requests.RequestException, OSError) as e:
        # Best effort: report the failure and move on to the next link.
        print(e)

CodePudding user response：

How about trying this:

import os.path
from shutil import copyfileobj
from typing import Iterator

import requests
from bs4 import BeautifulSoup

# Page listing the data directories that get crawled.
MAIN_URL = "https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data"

# Request headers passed along by make_soup when it fetches a page.
_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:96.0) Gecko/20100101 Firefox/96.0"
headers = {"User-Agent": _USER_AGENT}


def make_soup(url_to_visit: str) -> list:
    """Fetch a page and return its elements matching ``.Link--primary``.

    The request is sent through the module-level session ``s`` with the
    module-level ``headers``, and the response text is parsed with the
    ``lxml`` parser.
    """
    page = s.get(url_to_visit, headers=headers)
    document = BeautifulSoup(page.text, "lxml")
    return document.select(".Link--primary")


def get_main_links() -> Iterator[str]:
    """Yield absolute URLs for the links on ``MAIN_URL`` that mention "0.0.2".

    Every element returned by ``make_soup(MAIN_URL)`` whose ``href``
    contains ``"0.0.2"`` is turned into a ``https://github.com``-prefixed
    URL. This is a generator function, so the return annotation is an
    iterator rather than a list.
    """
    for link in make_soup(MAIN_URL):
        # Elements without an href are skipped instead of raising KeyError.
        href = link.get("href", "")
        if "0.0.2" in href:
            yield f"https://github.com{href}"


def raw_url_for(href: str) -> str:
    """Return the download URL for a site-relative GitHub file ``href``.

    File links on a GitHub listing page point at the HTML "blob" viewer.
    Swapping the first ``/blob/`` path segment for ``/raw/`` addresses the
    file content itself. An href without ``/blob/`` is only prefixed with
    the host.
    """
    raw_path = href.replace("/blob/", "/raw/", 1)
    return f"https://github.com{raw_path}"


def get_tsv_links(data_link: str) -> Iterator[list]:
    """Yield ``[tsv_url, data_link]`` for every ``.tsv`` link on a page.

    Args:
        data_link: URL of the page to scan; it is passed to ``make_soup``
            and echoed back as the second item of each yielded pair.

    Yields:
        Two-item lists: the download URL of a TSV file and ``data_link``.
    """
    for file in make_soup(data_link):
        # Elements without an href are skipped instead of raising KeyError.
        href = file.get("href", "")
        if href.endswith(".tsv"):
            yield [raw_url_for(href), data_link]


def download_tsv_files(tsv_links: list):
    """Download one TSV file into ``<download_dir>/<data_dir>/<file_name>``.

    Args:
        tsv_links: A two-item sequence ``[tsv_url, link]``. The last path
            component of ``tsv_url`` becomes the file name and the last
            path component of ``link`` becomes the sub-directory name.

    Uses the module-level session ``s`` and the module-level
    ``download_dir``. A response with an error status is reported and
    skipped, so an error page is never saved as a TSV file.
    """
    tsv_url, link = tsv_links
    file_name = tsv_url.rsplit("/", 1)[-1]
    # Strip a trailing slash first so the directory name is never empty.
    data_dir = link.rstrip("/").rsplit("/", 1)[-1]

    print(f"Getting {file_name} from {data_dir}...")
    target_dir = os.path.join(download_dir, data_dir)
    os.makedirs(target_dir, exist_ok=True)

    # The context manager releases the streamed connection when done.
    with s.get(tsv_url, stream=True) as response:
        if not response.ok:
            print(f"Skipping {file_name}: HTTP {response.status_code}")
            return
        # The raw stream is not content-decoded by default; without this a
        # gzip/deflate-encoded body would be written to disk still compressed.
        response.raw.decode_content = True
        with open(os.path.join(target_dir, file_name), "wb") as output:
            copyfileobj(response.raw, output)


if __name__ == "__main__":
    # Root folder for all downloads; download_tsv_files reads this global.
    download_dir = "vsoch"
    os.makedirs(download_dir, exist_ok=True)

    # `s` is the shared session used by make_soup and download_tsv_files.
    with requests.Session() as s:
        # Lazily walk every main link and every TSV entry found under it.
        every_tsv = (
            tsv_data
            for main_link in get_main_links()
            for tsv_data in get_tsv_links(main_link)
        )
        for tsv_data in every_tsv:
            download_tsv_files(tsv_data)

Output:

Getting data-2019.tsv from advent-health...
Getting data-latest.tsv from advent-health...
Getting data-2019.tsv from atlanticare-regional-medical-center...
Getting data-latest.tsv from atlanticare-regional-medical-center...
Getting data-2019.tsv from aurora-health-care-metro-inc....
Getting data-latest.tsv from aurora-health-care-metro-inc....
Getting data-2019.tsv from baptist-health-system-(san-antonio)...
Getting data-latest.tsv from baptist-health-system-(san-antonio)...
Getting data-2019.tsv from baptist-hospital-(miami)...
Getting data-latest.tsv from baptist-hospital-(miami)...

and so on...

Finally, you should have a directory called vsoch with the following structure:

vsoch
├── advent-health
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── atlanticare-regional-medical-center
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── aurora-health-care-metro-inc.
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── baptist-health-system-(san-antonio)
│   ├── data-2019.tsv
│   └── data-latest.tsv
├── baptist-hospital-(miami)
...