拿我自己的CSDN专栏《PHP篇》来说,作为Mark党,看到这么多好棒的文章(这样夸自己是不是很无耻捏?😛,其实我自己都脸红了捏🤗)怎么办,怎么办,怎么办?当然是马上M起来啊,可是一个一个复制粘贴,很不像猿的style,怎么办?跑程序喽!!
Markdown的链接格式是酱紫滴:`[标题](url)`;最简单的想法就是 Python + selenium,然后获取页面元素,拼成这个格式。
先看文档结构图:
思路一: 获取 `<ul class="column_article_list">` 下的所有 `li` 里的a链接,然后获取 `<ul class="column_article_list">` 下的所有 `li` 里的h2标题,再把两个list对应拼接成结果;思路没有问题,但操作比较麻烦,而且这个类如果用到其他页面的时候,如果这个li里有多个超链接,这个方法就不好使了。
思路二: 获取 `<ul class="column_article_list">` 下的所有 `li` 里的h2标题,再通过h2获取父级的父级即a链接,通用,方便,简单,但问题就是如何获取?直接上代码:
# Excerpt from a method body: wait for every article heading, then print
# one Markdown link per heading.
title_locator = (By.XPATH, '//h2[@class="title"]')
headings = self.timeout.until(
    EC.presence_of_all_elements_located(title_locator)
)
for heading in headings:
    # Resolve the element two levels above the <h2>; the article describes
    # that element as the <a> link wrapping the title.
    anchor = heading.find_element(By.XPATH, './/..//..')
    markdown_link = "[%s](%s)" % (heading.text, anchor.get_attribute('href'))
    print(markdown_link)
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class CSDNMarkdown(object):
    """Print every article of a CSDN column as a Markdown link.

    Creating an instance opens a Firefox window on the column URL.
    ``turn()`` then walks the pager and prints one ``[title](url)`` line
    per article found on each page.
    """

    # Class-level defaults keep the attributes defined even when
    # ``__init__`` fails before assigning them (``__del__`` relies on it).
    driver = None
    timeout = None   # WebDriverWait(driver, 5): used to find the next button
    album_url = ""
    waiter = None    # WebDriverWait(driver, 2): probes for the disabled button

    def __init__(self, album):
        """Open *album* (the column URL) in Firefox and create both waits."""
        self.album_url = album
        self.driver = webdriver.Firefox()
        self.driver.get(self.album_url)
        self.timeout = WebDriverWait(self.driver, 5)
        self.waiter = WebDriverWait(self.driver, 2)

    def turn(self):
        """Print the links of the current page and of every following page."""
        self.get_title()
        # Print only after a successful page turn. Printing after next()
        # has returned None would list the last page a second time.
        while self.next() is not None:
            self.get_title()

    def next(self):
        """Click the "next page" button of the pager.

        Returns:
            The driver's current URL after the click, or ``None`` when the
            button is disabled, cannot be found, or cannot be clicked.
        """
        disabled_button = (
            By.CSS_SELECTOR,
            '.js-page-next.js-page-action.ui-pager.ui-pager-disabled',
        )
        try:
            self.waiter.until(EC.presence_of_element_located(disabled_button))
        except WebDriverException:
            # No disabled button appeared within the short wait, so an
            # enabled one may be available; fall through and look for it.
            pass
        else:
            return None

        next_button = (By.CSS_SELECTOR, '.js-page-next.js-page-action.ui-pager')
        try:
            button = self.timeout.until(
                EC.presence_of_element_located(next_button)
            )
            button.click()
            return self.driver.current_url
        except WebDriverException:
            # Best effort: any selenium failure here ends the paging.
            # Catching WebDriverException instead of a bare ``except`` lets
            # KeyboardInterrupt and SystemExit propagate.
            return None

    def get_title(self):
        """Print ``[title](href)`` for every ``<h2 class="title">`` on the page."""
        headings = self.driver.find_elements(By.XPATH, '//h2[@class="title"]')
        for heading in headings:
            # Resolve the element two levels above the <h2>, which is
            # expected to be the <a> wrapping the article title.
            anchor = heading.find_element(By.XPATH, './/..//..')
            print("[%s](%s)" % (heading.text, anchor.get_attribute('href')))

    def __del__(self):
        """Shut the browser session down when the object is collected."""
        if self.driver is None:
            # __init__ never got as far as creating the driver.
            return
        # quit() ends the whole WebDriver session; close() would only
        # close the current window.
        self.driver.quit()
        self.driver = None
if __name__ == "__main__":
    # Column (category) page whose articles are listed as Markdown links.
    album_url = 'https://blog.csdn.net/yageeart/article/category/854202'
    app = CSDNMarkdown(album_url)
    app.turn()