I have written a scraper in Python 3 using Beautiful Soup 4 to retrieve the latest version of Plex Media Server from https://plex.tv, and I'd like some feedback on how to improve it.
The HTML the parser is to be used on can be found here (the section of code that downloads the HTML itself is not included in the parser).
This is the first time I've written any type of scraper / parser, and I feel my current code is rather messy with some parts being partially redundant.
parser.py
import os
import posixpath
import re
from os import path
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .version import PlexVersion
class PlexVersionParser(object):
    """Collect Plex Media Server download versions from downloads-page HTML.

    The markup is parsed once in the constructor; call :meth:`parse` to walk
    it and fill :attr:`versions` with one ``PlexVersion`` per download link.
    """

    # Placeholder stored for any field that could not be determined.
    _UNKNOWN = 'Unknown'

    # Boilerplate stripped from the tab title and from the link caption.
    _PLATFORM_PREFIX = re.compile(r'^Plex Media Server for ')
    _NAME_PREFIX = re.compile(r'^Download ?')

    # ``id`` patterns of the <div> elements that hold sections and tabs.
    _SECTION_ID = re.compile("pms-.*")
    _TAB_ID = re.compile("tabs-[0-9]+")

    def __init__(self, html):
        """Parse *html* with the built-in ``html.parser`` backend."""
        self._html = html
        self._soup = BeautifulSoup(self.html, 'html.parser')
        self._versions = []

    @property
    def html(self):
        """The raw HTML given to the constructor."""
        return self._html

    @property
    def soup(self):
        """The ``BeautifulSoup`` tree built from :attr:`html`."""
        return self._soup

    @property
    def versions(self):
        """List of versions collected by the most recent :meth:`parse`."""
        return self._versions

    @staticmethod
    def _version_from_address(address):
        """Return the second path component of the URL *address*.

        URL paths are always '/'-separated regardless of the host OS, so the
        path is normalised and split with ``posixpath`` instead of the
        platform-dependent ``os.path`` / ``os.sep``.

        Raises:
            IndexError: if the URL path has fewer than two components.
        """
        url_path = posixpath.normpath(urlparse(address).path)
        # An absolute path splits to ['', first, second, ...].
        return url_path.split('/')[2]

    def _create_version(self, version_string, platform, name, address):
        """Append a ``PlexVersion``, replacing ``None`` fields by 'Unknown'."""
        fields = [
            self._UNKNOWN if value is None else value
            for value in (version_string, platform, name, address)
        ]
        self.versions.append(PlexVersion(*fields))

    def _parse_download_link(self, platform, name, address):
        """Clean up one link's labels and record the version it points to."""
        platform = self._PLATFORM_PREFIX.sub('', platform)
        # A caption that is nothing but "Download" carries no name at all.
        name = self._NAME_PREFIX.sub('', name) or None
        version_string = self._version_from_address(address)
        self._create_version(version_string, platform, name, address)

    def _parse_download_links(self, title, links, prefix=None):
        """Record every anchor in *links*, optionally prefixing its caption."""
        for link in links:
            name = link.text if prefix is None else prefix + ' ' + link.text
            self._parse_download_link(title, name, link['href'])

    def _parse_tab(self, tab):
        """Record the download links found in one platform tab.

        A tab either holds its buttons directly in a ``pop-btn`` child, or
        holds an ``os`` child whose <li> items each carry an <h3> caption
        and their own ``pop-btn`` group.
        """
        title = tab.find(class_="title").text
        buttons = tab.find(class_="pop-btn", recursive=False)
        if buttons is not None:
            links = buttons.find_all("a", recursive=False)
            self._parse_download_links(title, links)
            return

        # No direct button group: fall back to the per-OS list layout.
        os_list = tab.find(class_="os", recursive=False)
        for item in os_list.find_all("li"):
            sub_title = item.find("h3").text
            sub_buttons = item.find(class_="pop-btn", recursive=False)
            sub_links = sub_buttons.find_all("a", recursive=False)
            self._parse_download_links(title, sub_links, sub_title)

    def _parse_tabs(self, tabs):
        """Parse each tab element in *tabs*."""
        for tab in tabs:
            self._parse_tab(tab)

    def _parse_section(self, section):
        """Parse every ``tabs-<n>`` <div> inside *section*."""
        self._parse_tabs(section.find_all("div", id=self._TAB_ID))

    def _parse_sections(self):
        """Parse every ``pms-*`` <div> in the document."""
        for section in self.soup.find_all("div", id=self._SECTION_ID):
            self._parse_section(section)

    def parse(self):
        """Walk the document and (re)populate :attr:`versions`.

        The list is emptied in place first, so calling ``parse()`` more than
        once does not accumulate duplicate entries.
        """
        self._versions.clear()
        self._parse_sections()
version.py
def _parse_version(version_string):
pieces = version_string.split('.')
if len(pieces) != 5:
raise ValueError('invalid version string')
major_version = int(pieces[0])
minor_version = int(pieces[1])
patch_version = int(pieces[2])
pre_release_version = int(pieces[3])
build_metadata = pieces[4]
return (major_version,
minor_version,
patch_version,
pre_release_version,
build_metadata)
class SemanticVersion(object):
    """A five-field dotted version with ordering on its leading fields.

    Instances compare (and hash) on the major, minor and patch fields only.
    NOTE(review): the fourth field is ignored by comparisons, as in the
    original implementation - confirm that this is intended.
    """

    # Number of leading fields that take part in comparisons and hashing.
    _COMPARED_FIELDS = 3

    def __init__(self, version_string):
        """Parse *version_string*; raises ValueError if it is malformed."""
        self._version = _parse_version(version_string)

    @property
    def version(self):
        """The parsed fields as a tuple."""
        return self._version

    @property
    def major_version(self):
        return self[0]

    @property
    def minor_version(self):
        return self[1]

    @property
    def patch_version(self):
        return self[2]

    @property
    def pre_release_version(self):
        return self[3]

    @property
    def build_metadata(self):
        return self[4]

    def __str__(self):
        return '.'.join(map(str, self.version))

    def __repr__(self):
        return str(self)

    def __getitem__(self, index):
        return self.version[index]

    def _key(self):
        """Return the tuple of fields used for comparison and hashing."""
        return tuple(self.version[:self._COMPARED_FIELDS])

    @classmethod
    def _key_of(cls, other):
        """Return the comparison key of *other*, or None if it has none.

        Plain indexable objects (e.g. tuples) are still accepted so that
        comparisons which worked through ``other[k]`` keep working.
        """
        if isinstance(other, SemanticVersion):
            return other._key()
        try:
            return tuple([other[k] for k in range(cls._COMPARED_FIELDS)])
        except (TypeError, IndexError, KeyError):
            return None

    def __eq__(self, other):
        other_key = self._key_of(other)
        if other_key is None:
            return NotImplemented
        return self._key() == other_key

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash the
        # same fields that equality compares.
        return hash(self._key())

    # Ordering is lexicographic: a larger major version wins regardless of
    # the minor and patch fields, and so on down the tuple.
    def __lt__(self, other):
        other_key = self._key_of(other)
        if other_key is None:
            return NotImplemented
        return self._key() < other_key

    def __le__(self, other):
        other_key = self._key_of(other)
        if other_key is None:
            return NotImplemented
        return self._key() <= other_key

    def __gt__(self, other):
        other_key = self._key_of(other)
        if other_key is None:
            return NotImplemented
        return self._key() > other_key

    def __ge__(self, other):
        other_key = self._key_of(other)
        if other_key is None:
            return NotImplemented
        return self._key() >= other_key
class PlexVersion(SemanticVersion):
    """A version number together with the download link it came from."""

    def __init__(self, version_string, platform, name, address):
        """Parse *version_string* and store the link's descriptive fields."""
        super().__init__(version_string)
        self._address = address
        self._name = name
        self._platform = platform

    @property
    def platform(self):
        """Platform label given to the constructor."""
        return self._platform

    @property
    def name(self):
        """Download name given to the constructor."""
        return self._name

    @property
    def address(self):
        """Download address given to the constructor."""
        return self._address

    def __str__(self):
        # "<platform> <name> v<dotted version>"
        return '{} {} v{}'.format(
            self.platform, self.name, super().__str__())
My question is mainly about `parser.py`, but any feedback on my semantic version implementation is very welcome.
The main "irks" I have with my parser are the following:
- The `_parse_tab` method handles the special `os` class case in a way I'm not too fond of. However, I am not sure of any other way of handling it.
- The way `_create_version` handles `None` cases looks rather verbose to me, but I don't know of a better way of creating a "default" value.