Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raw images first #76

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
25 changes: 18 additions & 7 deletions tumblr-photo-video-ripper.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,23 @@ def run(self):
def download(self, medium_type, post, target_folder):
try:
medium_url = self._handle_medium_url(medium_type, post)
if medium_url is not None:
self._download(medium_type, medium_url, target_folder)
#print("medium url is %s", medium_url)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this comment line.

resp_raw = requests.get(medium_url, stream=True, proxies=self.proxies, timeout=TIMEOUT)
if medium_type == "video":
self._download(medium_type, medium_url, target_folder, resp_raw)
elif medium_type == "photo":
medium_url_bak = medium_url
medium_url_dot = medium_url.split('.')
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The URL parsing here seems complex and error-prone.

The approach below is a better way. WDYT?

    def download(self, medium_type, post, target_folder):
        """Download the medium referenced by ``post`` into ``target_folder``.

        For photos, a raw-size URL is attempted first: the host
        ``68.media.tumblr.com`` is swapped for ``data.tumblr.com`` and the
        numeric size suffix captured by ``self.hd_photo_regex`` is replaced
        with ``raw``.  If the URL does not match the regex, or the raw
        attempt raises, the URL returned by ``_handle_medium_url`` is
        downloaded instead.

        Args:
            medium_type: ``"photo"`` selects the raw-first path; any other
                value is downloaded as-is.
            post: Post object handed to ``self._handle_medium_url``.
            target_folder: Destination passed through to ``self._download``.
        """
        try:
            medium_url = self._handle_medium_url(medium_type, post)
            if medium_url is None:
                return

            if medium_type == "photo":
                try:
                    # try to download raw image
                    medium_url_raw = medium_url.replace(
                        "68.media.tumblr.com", "data.tumblr.com")
                    raw_matched = self.hd_photo_regex.match(medium_url_raw)
                    if raw_matched is not None:
                        # Splice only the matched size suffix (group 2).
                        # A plain str.replace of the digits would also
                        # rewrite the same digits wherever they occur
                        # inside the "tumblr_<id>" part of the name.
                        # Assumes hd_photo_regex exposes the size as
                        # group 2, as the rule r".*(tumblr_\w+_(\d+))" does.
                        size_start, size_end = raw_matched.span(2)
                        medium_url_raw = (medium_url_raw[:size_start]
                                          + "raw"
                                          + medium_url_raw[size_end:])
                        self._download(medium_type, medium_url_raw,
                                       target_folder)
                        return
                except Exception:
                    # Best effort: any failure on the raw attempt falls
                    # back to the regular URL below.  KeyboardInterrupt and
                    # SystemExit are deliberately not swallowed.
                    pass

            self._download(medium_type, medium_url, target_folder)
        except TypeError:
            pass

    # can register different regex match rules
    def _register_regex_match_rules(self):
        """Install the regex matchers this instance uses.

        Sets ``self.regex_rules`` to the list of video match rules and
        ``self.hd_photo_regex`` to a compiled, case-insensitive pattern
        for photo file names.
        """
        # Rules are tried in list order; the first one that matches wins.
        hd_rule = video_hd_match()
        default_rule = video_default_match()
        self.regex_rules = [hd_rule, default_rule]

        photo_pattern = r".*(tumblr_\w+_(\d+))"
        self.hd_photo_regex = re.compile(photo_pattern, flags=re.IGNORECASE)

medium_url_underline = medium_url_dot[-2].split('_')
medium_url_raw = "http://data.tumblr."
for index in range(len(medium_url_underline) - 1):
medium_url_raw = medium_url_raw + medium_url_underline[index] + "_"
medium_url_raw = medium_url_raw + "raw." + medium_url_dot[-1]
if medium_url is not None:
self._download(medium_type, medium_url_raw, target_folder, resp_raw)
elif medium_url_bak is not None and resp_raw.status_code == 403:
resp= requests.get(medium_url_bak, stream=True, proxies=self.proxies, timeout=TIMEOUT)
self._download(medium_type, medium_url_bak, target_folder, resp)
except TypeError:
pass

Expand Down Expand Up @@ -99,7 +114,7 @@ def _handle_medium_url(self, medium_type, post):
"issues/new attached with below information:\n\n"
"%s" % post)

def _download(self, medium_type, medium_url, target_folder):
def _download(self, medium_type, medium_url, target_folder, resp):
medium_name = medium_url.split("/")[-1].split("?")[0]
if medium_type == "video":
if not medium_name.startswith("tumblr"):
Expand All @@ -115,10 +130,6 @@ def _download(self, medium_type, medium_url, target_folder):
retry_times = 0
while retry_times < RETRY:
try:
resp = requests.get(medium_url,
stream=True,
proxies=self.proxies,
timeout=TIMEOUT)
if resp.status_code == 403:
retry_times = RETRY
print("Access Denied when retrieve %s.\n" % medium_url)
Expand Down