[Python] PTT Website Crawler (pure code, not an explanatory article)
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import datetime
import json
from selenium.webdriver.firefox.options import Options
import sys, os
timespan = str(int(datetime.datetime.now().timestamp()))  # run timestamp appended to output filenames
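# Pipeline: get_htmltext() scrolls the pttweb.cc board listing and accumulates
# page snapshots; parse_htmltext() extracts the article list from that HTML;
# distinct_json_data() removes duplicates; parse_url_content() revisits each
# article to scrape its comments; make_file() dumps each stage to a JSON file.
# The e7-* and v-btn class names below mirror pttweb.cc's markup at the time
# of writing and may have changed since.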
def get_htmltext():
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    #driver.get('https://www.pttweb.cc/bbs/NBA')
    driver.get('https://www.pttweb.cc/bbs/NBA/search/t/Live')
    time.sleep(1)
    htmltext = ""
    # The range value controls how many articles are fetched (15 => roughly 100 articles)
    for i in range(17):
        print(i)
        y = 500 * (i + 1)
        driver.execute_script(f"window.scrollTo(0, {y})")
        time.sleep(2)
        htmltext += driver.page_source
    driver.quit()
    file1 = open("ptt_origin_" + timespan + ".txt", "w", encoding="utf-8")
    file1.write(htmltext)
    file1.close()
    return htmltext
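# Because get_htmltext() concatenates one page_source snapshot per scroll, the
# combined HTML contains multiple <body> tags, so parse_htmltext() walks every
# snapshot via find_all('body'); duplicate articles are removed later by
# distinct_json_data().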
def parse_htmltext(htmltext):
    '''
    Parse the raw article-list HTML scraped from pttweb.cc.
    htmltext is the raw HTML, str.
    Returns a list of article-info dicts.
    '''
    result = []
    print('into parse_htmltext')
    soup = BeautifulSoup(htmltext, 'html.parser')
    #body = soup.find('body')
    body = soup.find_all('body')
    last_body = 0
    if len(body) > 0:
        last_body = len(body) - 1
    print('last_body:', str(last_body))
    for body_idx in range(last_body):
        posts = body[body_idx].select('div[class="mt-2"]')[0]
        articles = posts.find_all('div', 'e7-container')
        print('e7-container:', len(articles))
        # Set how many articles to fetch
        #range_count = 100
        #if len(articles) < range_count:
        #    range_count = len(articles)
        #print('range_count:', range_count)
        #index = 0
        try:
            for i in range(len(articles)):
                print(i)
                #if index > range_count:
                #    break
                article_info = {
                    "status": 0,
                    "fail_msg": "",
                    "index": 0,
                    "push_boo_count": "",
                    "comment_count": "",
                    "title": "",
                    "content_link": "",
                    "author": "",
                    "time": "",
                    "content": "",
                    "comments": []
                }
                # Push/boo (upvote/downvote) score
                push_boo_count = articles[i].find_all('div', 'e7-recommendScore')
                if len(push_boo_count) > 0:
                    article_info["push_boo_count"] = push_boo_count[0].text.replace('\n', '').strip()
                    print(push_boo_count[0].text.replace('\n', '').strip())
                else:
                    continue
                # Comment count
                comment_count = articles[i].find_all('div', 'e7-recommendCount')
                if len(comment_count) > 0:
                    article_info["comment_count"] = comment_count[0].text.replace('\n', '').strip()
                    print(comment_count[0].text.replace('\n', '').strip())
                else:
                    continue
                # Title
                title = articles[i].find_all('span', 'e7-show-if-device-is-not-xs')
                if len(title) > 0:
                    article_info["title"] = title[0].text
                    print(title[0].text)
                else:
                    continue
                # Article link
                content_link = articles[i].find_all('a')
                if len(content_link) > 0:
                    article_info["content_link"] = content_link[0].get('href')
                    print(content_link[0].get('href'))
                else:
                    continue
                # Author
                author = articles[i].find_all('span', 'grey--text e7-link-to-article')
                if len(author) > 0:
                    article_info["author"] = author[0].text
                    print(author[0].text)
                else:
                    continue
                # Post time (named post_time so the time module is not shadowed)
                post_time = articles[i].find_all('span', 'text-no-wrap')
                if len(post_time) > 1:
                    print(post_time[1].text)
                    article_info["time"] = post_time[1].text
                else:
                    continue
                #index
                #article_info["index"] = index + 1
                result.append(article_info)
                #index = index + 1
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            article_info["status"] = 1
            article_info["fail_msg"] = f"failed while writing: {exc_type} {fname} {exc_tb.tb_lineno}"
            result.append(article_info)
    return result
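# parse_url_content() opens each article page, scrolls roughly once per ten
# expected comments, clicks any "load more" buttons to expand collapsed
# comments, then parses the commenter account, push/boo tag, text, time, and
# floor of each comment.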
# Fetch the full content and comments for every article URL
def parse_url_content(articles):
    print('\r\n\r\n parse_urllist_start \r\n\r\n')
    print(len(articles))
    for article in articles:
        try:
            if article["content_link"].strip() != "":
                options = Options()
                options.headless = True
                driver = webdriver.Firefox(options=options)
                print('https://www.pttweb.cc' + article["content_link"])
                driver.get('https://www.pttweb.cc' + article["content_link"])
                time.sleep(0.2)
                scrl_count = 10
                print(article["comment_count"])
                if article["comment_count"].strip() != "" and article["comment_count"].strip().isnumeric():
                    scrl_count = int(article["comment_count"])
                scrl_count_s = int(scrl_count / 10)
                print(scrl_count)
                for i in range(scrl_count_s):
                    print(str(i) + "/" + str(scrl_count_s))
                    y = 100 * (i + 1)
                    driver.execute_script(f"window.scrollTo(0, {y})")
                    time.sleep(0.2)
                # Click every outlined "load more" button to expand collapsed comments.
                # find_elements_by_tag_name is the Selenium 3-era API this script targets;
                # Selenium 4.3+ uses driver.find_elements(By.TAG_NAME, 'button') instead.
                def clickformore():
                    hrefbtns = driver.find_elements_by_tag_name('button')
                    for btn in hrefbtns:
                        try:
                            s = btn.get_attribute('class')
                        except:
                            continue
                        if s == 'amber--text v-btn v-btn--outline v-btn--depressed theme--dark':
                            try:
                                btn.click()
                                time.sleep(0.2)
                            except:
                                continue
                # Called twice so buttons revealed by the first pass are also clicked.
                clickformore()
                clickformore()
                htmltext = driver.page_source
                driver.quit()
                #file1 = open("ptt_commant_" + timespan + ".txt", "w", encoding="utf-8")
                #file1.write(htmltext)
                #file1.close()
                soup = BeautifulSoup(htmltext, 'html.parser')
                body = soup.find('body')
                # Article content
                #content = body.select('div[class="e7-main-content"]')
                content = body.find_all('div', 'e7-main-content')
                account = body.find_all('div', 'e7-author')
                push_boo = body.find_all('div', 'e7-left')
                comment = body.find_all('div', 'yellow--text text--darken-2 e7-recommend-message')
                w_time = body.find_all('div', 'e7-larger-than-xs ml-2 e7-ipdatetime2')
                floor = body.find_all('span', 'e7-floor e7-xs')  # collected but unused; floor is parsed from w_time below
                if len(content) > 0:
                    print(content[0].text)
                    print('content:' + str(len(content)))
                    article["content"] = content[0].text
                    for i in range(scrl_count):
                        comment_body = {
                            "account": "",
                            "push_boo": "",
                            "comment": "",
                            "time": "",
                            "floor": ""
                        }
                        # Commenter account
                        print("account_count:" + str(len(account)))
                        print(str(i) + "/" + str(scrl_count))
                        if i < len(account):
                            comment_body["account"] = account[i].text
                            print('account:' + account[i].text)
                            # Push/boo (upvote/downvote) tag
                            comment_body["push_boo"] = push_boo[i].text
                            print('push_boo:' + push_boo[i].text)
                            # Comment text
                            comment_body["comment"] = comment[i].text
                            print('comment:' + comment[i].text)
                            # Time
                            comment_body["time"] = w_time[i].text.split(',')[0].strip()
                            print('time:' + w_time[i].text.split(',')[0].strip())
                            # Floor (comment position)
                            comment_body["floor"] = w_time[i].text.split(',')[1].strip()
                            print('floor:' + w_time[i].text.split(',')[1].strip())
                            print('')
                            article["comments"].append(comment_body)
                        else:
                            continue
                    # end of comment loop
                else:
                    continue
                print(article)
        except Exception as e:
            print(e)
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
    return articles
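# Deduplicate articles by content_link and assign a 1-based index, keeping at
# most limit_all articles.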
def distinct_json_data(data):
    if len(data) == 0:
        return []
    limit_all = 100  # cap on the number of articles kept
    limit_s = 1      # running count of articles kept
    unique_data = []
    for i in range(len(data)):
        if len(unique_data) > 0:
            flag_uni = False
            for j in range(len(unique_data)):
                if unique_data[j]["content_link"] == data[i]["content_link"]:
                    flag_uni = True
            if flag_uni == False:
                if limit_s < limit_all:
                    data[i]["index"] = limit_s
                    unique_data.append(data[i])
                    limit_s = limit_s + 1
                else:
                    break
        else:
            # The first article also gets an index and counts toward the cap.
            data[i]["index"] = limit_s
            unique_data.append(data[i])
            limit_s = limit_s + 1
        if limit_s >= limit_all:
            break
    return unique_data
def make_file(jsons, filename):
    file1 = open(filename + "_" + timespan + ".json", "w", encoding="utf-8")
    file1.write(json.dumps(jsons, indent=4, ensure_ascii=False))
    file1.close()
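# Hypothetical helper sketch, not part of the original script: flatten the
# article dicts produced by parse_url_content() into one pandas DataFrame with
# a row per comment (this also gives the pandas import above a use).
def comments_to_dataframe(articles):
    # One row per comment, carrying the parent article's title and author.
    rows = [
        {"title": a["title"], "author": a["author"], **c}
        for a in articles
        for c in a["comments"]
    ]
    return pd.DataFrame(rows)
# Example: df = comments_to_dataframe(all_articles_comments); print(df.head())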
if __name__ == '__main__':
    htmltext = get_htmltext()
    get_all_article = parse_htmltext(htmltext)
    make_file(get_all_article, 'get_all_article')
    # Remove duplicate JSON records and assign each an index
    distinct_data = distinct_json_data(get_all_article)
    make_file(distinct_data, 'distinct_article')
    all_articles_comments = parse_url_content(distinct_data)
    make_file(all_articles_comments, 'all_articles_comments')