Here is the main URL. https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data
I can collect strings that have this structure, into a list. /vsoch/hospital-chargemaster/0.0.2/data/baptist-health-system-(san-antonio)
The full file path looks something like this. https://raw.githubusercontent.com/vsoch/hospital-chargemaster/0.0.2/data/baptist-health-system-(san-antonio)/data-latest.tsv
My question is: how can I download a bunch of TSV files to my desktop in one go? I know some TSV files are pretty hard to parse, and I don't want to invest a lot of time getting at things that are hard to reach. I just want code that downloads some/most of the TSV files to a folder on my desktop.
# main URL
# https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data
import requests
from bs4 import BeautifulSoup
import urllib
import os

all_links = []
url = "https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data"
# Hrefs of the per-hospital directories start with this prefix; every other
# link on the page (navigation, footer, ...) is ignored.
DATA_PREFIX = "/vsoch/hospital-chargemaster/tree/0.0.2/data/"
# Folder that receives one "<hospital>.tsv" file per directory. Point this at
# any location you like (e.g. a folder on the desktop).
OUTPUT_DIR = "hospital-chargemaster"

# Getting the webpage, creating a Response object.
response = requests.get(url)
# Extracting the source code of the page.
data = response.text
# Passing the source code to BeautifulSoup to create a BeautifulSoup object for it.
soup = BeautifulSoup(data, 'lxml')
# Extracting all the <a> tags into a list.
tags = soup.find_all('a')
# Extracting URLs from the attribute href in the <a> tags.
# tag.get('href') is None for anchors without an href, so those are skipped
# instead of crashing the later .replace() call; duplicates are dropped so
# each directory is downloaded only once.
for tag in tags:
    href = tag.get('href')
    if href and href.startswith(DATA_PREFIX) and href not in all_links:
        all_links.append(href)

os.makedirs(OUTPUT_DIR, exist_ok=True)
for item in all_links:
    # /vsoch/hospital-chargemaster/tree/0.0.2/data/<hospital>
    #   -> /vsoch/hospital-chargemaster/0.0.2/data/<hospital>
    item = item.replace('tree/', '')
    print(item)
    try:
        DOWNLOAD_URL = 'https://raw.githubusercontent.com' + item + '/data-latest.tsv'
        print(DOWNLOAD_URL)
        r = requests.get(DOWNLOAD_URL)
        print(r)
        # Do not save a 404/500 error page as if it were data.
        r.raise_for_status()
        # Name the local file after the hospital directory (last path segment).
        hospital = item.rstrip('/').rsplit('/', 1)[-1]
        with open(os.path.join(OUTPUT_DIR, hospital + '.tsv'), 'wb') as output:
            output.write(r.content)
    except (requests.RequestException, OSError) as e:
        # Best effort: report the failure and carry on with the next hospital.
        print(e)
CodePudding user response:
How about trying this:
import os.path
from shutil import copyfileobj
from typing import Iterator

import requests
from bs4 import BeautifulSoup
# Listing page of the tagged release; it links to one directory per hospital.
MAIN_URL = "https://github.com/vsoch/hospital-chargemaster/tree/0.0.2/data"

# Request headers sent with every page fetch (a desktop Firefox User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:96.0) "
        "Gecko/20100101 Firefox/96.0"
    ),
}
def make_soup(url_to_visit: str) -> list:
    """Fetch a page and return the elements matching ``.Link--primary``.

    The request goes through the module-level session ``s`` (opened in the
    ``__main__`` block) and sends the module-level ``headers``.

    Args:
        url_to_visit: Absolute URL of the page to download.

    Returns:
        The elements selected by the ``.Link--primary`` CSS selector; empty
        when nothing matches.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = s.get(url_to_visit, headers=headers, timeout=30)
    # Fail loudly on error pages (e.g. rate limiting) instead of parsing them
    # and silently reporting "no links".
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml").select(".Link--primary")
def get_main_links() -> Iterator[str]:
    """Yield the absolute URL of each directory linked from ``MAIN_URL``.

    Only links whose href contains ``"0.0.2"`` are kept; each href is prefixed
    with ``https://github.com``.

    Yields:
        One absolute ``https://github.com/...`` URL per matching link.
    """
    for link in make_soup(MAIN_URL):
        # .get() avoids a KeyError on matched elements that carry no href.
        href = link.get("href")
        if href and "0.0.2" in href:
            yield f"https://github.com{href}"
def get_tsv_links(data_link: str) -> Iterator[list]:
    """Yield ``[tsv_url, data_link]`` for every ``.tsv`` file linked on a page.

    Args:
        data_link: Absolute URL of a directory listing page.

    Yields:
        Two-item lists: the download URL of one ``.tsv`` file, followed by
        the unchanged ``data_link`` it was found on.
    """
    for file in make_soup(data_link):
        # .get() avoids a KeyError on matched elements that carry no href.
        href = file.get("href")
        if not href or not href.endswith(".tsv"):
            continue
        if "/blob/" in href:
            # A ".../blob/<tag>/..." href is the HTML file viewer, not the
            # file itself. The file content lives at
            # https://raw.githubusercontent.com/<owner>/<repo>/<tag>/<path>.
            tsv_url = "https://raw.githubusercontent.com" + href.replace("/blob/", "/", 1)
        else:
            tsv_url = f"https://github.com{href}"
        yield [tsv_url, data_link]
def download_tsv_files(tsv_links: list):
    """Download one TSV file into ``<download_dir>/<directory name>/``.

    Uses the module-level session ``s`` and the module-level ``download_dir``
    (both set up in the ``__main__`` block).

    Args:
        tsv_links: A two-item sequence ``[tsv_url, link]`` where ``tsv_url``
            is the file to fetch and ``link`` is the directory page it was
            listed on; the last path segment of ``link`` names the local
            sub-folder.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        OSError: If the target folder or file cannot be written.
    """
    tsv_url, link = tsv_links
    file_name = tsv_url.rsplit("/", 1)[-1]
    # rstrip guards against a trailing slash producing an empty folder name.
    data_dir = link.rstrip("/").rsplit("/", 1)[-1]
    print(f"Getting {file_name} from {data_dir}...")

    target_dir = os.path.join(download_dir, data_dir)
    os.makedirs(target_dir, exist_ok=True)

    # The context manager releases the streamed connection back to the pool.
    with s.get(tsv_url, stream=True, timeout=30) as response:
        # Do not save an error page under a .tsv name.
        response.raise_for_status()
        with open(os.path.join(target_dir, file_name), "wb") as output:
            # iter_content decodes gzip/deflate transfer encodings; copying
            # response.raw directly would write the still-compressed bytes.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                output.write(chunk)
if __name__ == "__main__":
    # Root folder for everything that gets downloaded; read as a module-level
    # name by download_tsv_files().
    download_dir = "vsoch"
    os.makedirs(download_dir, exist_ok=True)

    # One shared session (module-level name ``s``) serves every request made
    # by make_soup() and download_tsv_files().
    with requests.Session() as s:
        # Lazily walk every directory page and every TSV link found on it.
        pending = (
            tsv_pair
            for directory_url in get_main_links()
            for tsv_pair in get_tsv_links(directory_url)
        )
        for tsv_pair in pending:
            download_tsv_files(tsv_pair)
Output:
Getting data-2019.tsv from advent-health...
Getting data-latest.tsv from advent-health...
Getting data-2019.tsv from atlanticare-regional-medical-center...
Getting data-latest.tsv from atlanticare-regional-medical-center...
Getting data-2019.tsv from aurora-health-care-metro-inc....
Getting data-latest.tsv from aurora-health-care-metro-inc....
Getting data-2019.tsv from baptist-health-system-(san-antonio)...
Getting data-latest.tsv from baptist-health-system-(san-antonio)...
Getting data-2019.tsv from baptist-hospital-(miami)...
Getting data-latest.tsv from baptist-hospital-(miami)...
and so on...
Finally, you should have a directory called vsoch with the following structure:
vsoch
├── advent-health
│ ├── data-2019.tsv
│ └── data-latest.tsv
├── atlanticare-regional-medical-center
│ ├── data-2019.tsv
│ └── data-latest.tsv
├── aurora-health-care-metro-inc.
│ ├── data-2019.tsv
│ └── data-latest.tsv
├── baptist-health-system-(san-antonio)
│ ├── data-2019.tsv
│ └── data-latest.tsv
├── baptist-hospital-(miami)
...
