本文来源吾爱破解论坛
本帖最后由 mikeee 于 2020-2-3 15:23 编辑
[Python] 纯文本查看 复制代码
'''
fetch_book.py

Install the dependencies:
    pip install tqdm loguru pyquery selenium
    # also download the chromedriver matching your Chrome version:
    # https://chromedriver.chromium.org/downloads

Run:
    python fetch_book.py

Fetches the book "Python+TensorFlow机器学习实战" from
https://lib-nuanxin.wqxuetang.com/read/pdf/3208943

To avoid getting the IP blacklisted, fairly long sleeps are inserted between
requests.  A full crawl takes roughly 6 hours -- the script pretends to be a
human who reads the whole book in 6 hours.
'''
# pylint: disable=invalid-name
from time import sleep
from random import random, randint

from pyquery import PyQuery as pq
from loguru import logger
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17'  # noqa

url = 'https://lib-nuanxin.wqxuetang.com'
book_url = f'{url}/read/pdf/3208943'

# Only the first MAX_PAGES pages are fetched (demo limit of the original
# script).  Set to None to fetch every page of the book.
MAX_PAGES = 5

# How many times loading a page image is attempted before the page is skipped.
MAX_ATTEMPTS = 3


def get_chrome_driver(timeout=120, headless=True):
    '''Start a Selenium Chrome driver.

    Args:
        timeout: base page-load timeout in seconds; the driver is configured
            with ``timeout + 2``.
        headless: when true, Chrome is started with ``--headless``.

    Returns:
        The Chrome WebDriver, or None (falsy) when it could not be started.
        The failure is logged as a warning rather than raised.
    '''
    chrome_options = Options()
    chrome_options.add_argument(f'user-agent={UA}')
    if headless:
        chrome_options.add_argument('--headless')

    driver_ = None
    try:
        # ``options=`` is accepted by Selenium 3.8+ and 4.x; the older
        # ``chrome_options=`` keyword was removed in Selenium 4.
        driver_ = webdriver.Chrome(options=chrome_options)
        driver_.set_page_load_timeout(timeout + 2)
    except Exception as exc:  # pylint: disable=broad-except
        # loguru formats with str.format-style "{}" placeholders, not "%s".
        logger.warning('webdriver.Chrome Exception: {}', exc)
    return driver_


def _parse_total_pages(page_source):
    '''Return the total page count read from the reader's page counter.

    The text of the ``.page-head-tol`` element is split on "/" and the part
    after the slash is parsed as an integer.

    Raises:
        SystemError: when the counter text does not contain exactly one "/"
            or the part after it is not an integer.
    '''
    counter_text = pq(page_source)('.page-head-tol').text()
    parts = counter_text.split('/')
    if len(parts) != 2:
        raise SystemError(' need to finetune ')
    try:
        return int(parts[1].strip())
    except ValueError as exc:
        logger.error(exc)
        raise SystemError(' Something is wrong, need fine tune') from exc


def _page_image_url(page_source, page, base_url):
    '''Return the absolute image URL of *page*, or None when it is not found.

    Every element with a non-empty ``index`` attribute is collected in
    document order and the ``page``-th one (1-based) is searched for an
    ``<img>`` whose ``src`` is returned.  A relative ``src`` is prefixed with
    *base_url*.  The original author observed URLs of the form
    ``<base_url>/page/img/<book id>/<page>?k=<token>``.
    '''
    doc = pq(page_source)
    doc.remove_namespaces()
    containers = [pq(elm) for elm in doc('*') if pq(elm).attr('index')]
    if page > len(containers):
        return None
    rel_url = containers[page - 1]('img').attr('src')
    if not rel_url:
        return None
    return rel_url if rel_url.startswith('http') else f'{base_url}{rel_url}'


# To get rid of the browser window, set headless to True.
driver = get_chrome_driver(headless=False)
if not driver:
    raise SystemExit(' Make sure you have chromedriver in your Path.')

try:
    driver.get(book_url)
    sleep(5)  # give the reader time to render the page counter

    tot_page = _parse_total_pages(driver.page_source)
    last_page = tot_page if MAX_PAGES is None else min(tot_page, MAX_PAGES)

    # To fetch only selected pages, iterate over e.g. tqdm([20, 30, 40]).
    for page in tqdm(range(1, last_page + 1)):
        driver.get(book_url)
        sleep(10 + random())

        # Type the page number into the reader's page input and press Enter.
        driver.find_element(By.CLASS_NAME, 'el-input__inner').clear()
        sleep(0.5)
        driver.find_element(By.CLASS_NAME, 'el-input__inner').send_keys(f'{page}\n')

        # Long randomised pause so the traffic looks like a human reader.
        sleep_ = 10 + randint(25, 45) + random()
        logger.info(f' Sleeping {sleep_:.2f} s')
        sleep(sleep_)

        dl_url = _page_image_url(driver.page_source, page, url)
        if dl_url is None:
            # Do not abort an hours-long crawl because of one missing page.
            logger.error(' No image url found for page {}, skipping', page)
            continue
        logger.debug(f' dl_url: {dl_url}')

        filename = f'{page:03d}.png'

        for attempt in range(MAX_ATTEMPTS):
            try:
                driver.get(dl_url)
                sleep(attempt * 2 + 10 + random())
                break
            except Exception as exc:  # pylint: disable=broad-except
                logger.error('error: {}, retrying {}', exc, attempt + 1)
        else:
            # Every attempt failed: a screenshot now would capture a stale or
            # error page, so skip this page instead of saving a wrong image.
            logger.error(' Giving up on page {} after {} attempts',
                         page, MAX_ATTEMPTS)
            continue

        driver.save_screenshot(filename)
        logger.info(f' Saved {filename} ')
finally:
    # Always shut the browser down, even when the crawl fails midway.
    driver.quit()
爬取《Python+TensorFlow机器学习实战》:https://lib-nuanxin.wqxuetang.com/read/pdf/3208943 。爬别的书可能需要做相应的修改。
觉得有用评个分回个帖什么的,欢迎分享改进版 —— 感觉拿到 cookies 后可以开 requests 或 httpx 直接下载图片, 不过折腾了一下没有成功。
请移步 https://github.com/gumblex/wqxt_pdf :高人做的,貌似真的可用,至少在 Ubuntu 里是可用的。
或参考另一个贴 “[Python] 文泉学堂的高清png下载” https://www.52pojie.cn/forum.php?mod=viewthread&tid=1097887&page=1&extra=#pid29706143
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。
- 上一篇: 斗图神器——python让你斗图无敌
- 下一篇: 获取类及函数工具