Example: How to download an audio file + PDF from a URL
By JoeVu, at 18:09 on February 9, 2023
Downloading files from a URL is a common task in web scraping and automation, and Python offers several libraries that make it efficient. In this article, we'll explore different approaches to downloading files from a given URL using requests with BeautifulSoup, Scrapy, Playwright, and Selenium.
As an example, from https://event.choruscall.com/mediaframe/webcast.html?webcastid=370tVnvP&securityString=Eiys0PocsKYan5O3oWpFsYe3 we can easily download both the audio file and the PDF file. Let's start.
Approach 1: Using requests and BeautifulSoup
Pre-installation
pip install requests
pip install beautifulsoup4
The full example
import pathlib
import re
import urllib.parse

import requests
from bs4 import BeautifulSoup

url = "https://event.choruscall.com/mediaframe/webcast.html?webcastid=370tVnvP&securityString=Eiys0PocsKYan5O3oWpFsYe3"

# Make a request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.title.text  # the page title is used as the output filename

    # Find the links to the PDF and audio files
    audio_link = soup.find("a", string=re.compile("Download Audio"))
    pdf_link = soup.find("a", string=re.compile("Download Presentation"))

    # Download the PDF file
    if pdf_link:
        pdf_url = urllib.parse.urljoin(url, pdf_link['href'])
        pdf_response = requests.get(pdf_url)
        extension = pathlib.Path(pdf_link['href']).suffix  # includes the leading dot, e.g. ".pdf"
        with open(f"{title}{extension}", 'wb') as pdf_file:
            pdf_file.write(pdf_response.content)

    # Download the audio file
    if audio_link:
        audio_url = urllib.parse.urljoin(url, audio_link['href'])
        audio_response = requests.get(audio_url)
        extension = pathlib.Path(audio_link['href']).suffix
        with open(f"{title}{extension}", 'wb') as audio_file:
            audio_file.write(audio_response.content)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
Approach 2: Using Scrapy
Pre-installation
pip install scrapy
The full example
import pathlib
import re
import urllib.parse

import requests
import scrapy
from bs4 import BeautifulSoup


class DownloadSpider(scrapy.Spider):
    name = 'download_spider'
    start_urls = [
        'https://event.choruscall.com/mediaframe/webcast.html?webcastid=370tVnvP&securityString=Eiys0PocsKYan5O3oWpFsYe3'
    ]

    def parse(self, response):
        if response.status == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.text

            # Find the links to the PDF and audio files
            audio_link = soup.find("a", string=re.compile("Download Audio"))
            pdf_link = soup.find("a", string=re.compile("Download Presentation"))

            # Download the PDF file
            if pdf_link:
                pdf_url = urllib.parse.urljoin(response.url, pdf_link['href'])
                pdf_response = requests.get(pdf_url)
                extension = pathlib.Path(pdf_link['href']).suffix
                with open(f"{title}{extension}", 'wb') as pdf_file:
                    pdf_file.write(pdf_response.content)

            # Download the audio file
            if audio_link:
                audio_url = urllib.parse.urljoin(response.url, audio_link['href'])
                audio_response = requests.get(audio_url)
                extension = pathlib.Path(audio_link['href']).suffix
                with open(f"{title}{extension}", 'wb') as audio_file:
                    audio_file.write(audio_response.content)
        else:
            self.logger.error(f"Failed to retrieve the webpage. Status code: {response.status}")
Save the spider to my_spider.py, then run it with: scrapy runspider my_spider.py
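If you prefer to launch the spider with plain python instead of the scrapy CLI, one option (a sketch, not part of the original example) is to run it through Scrapy's CrawlerProcess by appending this to the bottom of my_spider.py:
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    # Run the spider in-process; blocks until the crawl finishes
    process = CrawlerProcess()
    process.crawl(DownloadSpider)
    process.start()
With that in place, python my_spider.py behaves the same as the runspider command.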
Approach 3: Using Playwright
Pre-installation
pip install pytest-playwright
playwright install
The full example
# Create a new file download_files.py
import pathlib
import re
import urllib.parse

import requests
from bs4 import BeautifulSoup
from playwright.sync_api import Page


def test_download_files(page: Page):
    page.goto('https://event.choruscall.com/mediaframe/webcast.html?webcastid=370tVnvP&securityString=Eiys0PocsKYan5O3oWpFsYe3')

    # Parse the HTML content of the rendered page
    soup = BeautifulSoup(page.content(), 'html.parser')
    title = soup.title.text

    # Find the links to the PDF and audio files
    audio_link = soup.find("a", string=re.compile("Download Audio"))
    pdf_link = soup.find("a", string=re.compile("Download Presentation"))

    # Download the PDF file
    if pdf_link:
        pdf_url = urllib.parse.urljoin(page.url, pdf_link['href'])
        pdf_response = requests.get(pdf_url)
        extension = pathlib.Path(pdf_link['href']).suffix
        with open(f"{title}{extension}", 'wb') as pdf_file:
            pdf_file.write(pdf_response.content)
        assert pathlib.Path(f"{title}{extension}").is_file()

    # Download the audio file
    if audio_link:
        audio_url = urllib.parse.urljoin(page.url, audio_link['href'])
        audio_response = requests.get(audio_url)
        extension = pathlib.Path(audio_link['href']).suffix
        with open(f"{title}{extension}", 'wb') as audio_file:
            audio_file.write(audio_response.content)
        assert pathlib.Path(f"{title}{extension}").is_file()
Run the command to execute the Playwright test: pytest download_files.py
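The test above still hands the file URLs off to requests for the actual download. Playwright can also capture the browser's own download event, which helps when a link only works inside the page session. The following is a rough sketch of that alternative, assuming the "Download Presentation" link actually triggers a browser download (not verified for this site):
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto('https://event.choruscall.com/mediaframe/webcast.html?webcastid=370tVnvP&securityString=Eiys0PocsKYan5O3oWpFsYe3')
    # Wait for the download triggered by clicking the link
    with page.expect_download() as download_info:
        page.get_by_text("Download Presentation").click()
    download = download_info.value
    # Save under the filename suggested by the server / link target
    download.save_as(download.suggested_filename)
    browser.close()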
Approach 4: Using Selenium
Pre-installation
pip install selenium
Note: Selenium 4.6+ downloads the browser driver (geckodriver for Firefox) automatically via Selenium Manager; on older versions, install geckodriver manually and put it on your PATH.
The full example
# Create a file download_files_selenium.py
import pathlib
import urllib.parse

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://event.choruscall.com/mediaframe/webcast.html?webcastid=370tVnvP&securityString=Eiys0PocsKYan5O3oWpFsYe3'

driver = webdriver.Firefox()
driver.get(url)
title = driver.title

# Find the links to the PDF and audio files
audio_links = driver.find_elements(By.XPATH, '//a[contains(text(), "Download Audio")]')
pdf_links = driver.find_elements(By.XPATH, '//a[contains(text(), "Download Presentation")]')

# Download the PDF file
if pdf_links:
    pdf_link = pdf_links[0].get_property('href')
    pdf_url = urllib.parse.urljoin(url, pdf_link)
    pdf_response = requests.get(pdf_url)
    extension = pathlib.Path(pdf_link).suffix
    with open(f"{title}{extension}", 'wb') as pdf_file:
        pdf_file.write(pdf_response.content)

# Download the audio file
if audio_links:
    audio_link = audio_links[0].get_property('href')
    audio_url = urllib.parse.urljoin(url, audio_link)
    audio_response = requests.get(audio_url)
    extension = pathlib.Path(audio_link).suffix
    with open(f"{title}{extension}", 'wb') as audio_file:
        audio_file.write(audio_response.content)

# Shut down the browser and the driver process
driver.quit()
Run the command to execute the Selenium script: python download_files_selenium.py
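If you don't want a visible browser window to open while the script runs, Firefox can be started in headless mode. A small sketch of that option setup (the rest of the script stays the same):
from selenium import webdriver

# Start Firefox in headless mode so no browser window is shown
options = webdriver.FirefoxOptions()
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)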
Conclusion
In this article, we explored multiple approaches to downloading files from a URL using Python. Each method, whether it's leveraging the simplicity of requests and BeautifulSoup, the robustness of Scrapy, the headless browser automation of Playwright, or the dynamic capabilities of Selenium, provides a unique set of tools for handling various web scraping and file downloading scenarios.
The choice of approach depends on the specific requirements of your project, the complexity of the target webpage, and your familiarity with the respective libraries. As you embark on your file downloading journey, remember to adapt the code snippets according to the structure of the webpage you are working with.
With these powerful Python tools at your disposal, you can efficiently retrieve files from URLs, making web automation and data extraction tasks more accessible and customizable. Happy coding, and may your Python scripts download files seamlessly from the vast landscape of the internet!