[Python] PTT Website Crawler (pure code, not an explanatory article)
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import datetime
import json
from selenium.webdriver.firefox.options import Options
import sys, os
timespan = str(int(datetime.datetime.now().timestamp()))  # run timestamp appended to output filenames
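# Pipeline: get_htmltext() scrolls the pttweb.cc board listing and accumulates
# page snapshots; parse_htmltext() extracts the article list from that HTML;
# distinct_json_data() removes duplicates; parse_url_content() revisits each
# article to scrape its comments; make_file() dumps each stage to a JSON file.
# The e7-* and v-btn class names below mirror pttweb.cc's markup at the time
# of writing and may have changed since.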
def get_htmltext():
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    #driver.get('https://www.pttweb.cc/bbs/NBA')
    driver.get('https://www.pttweb.cc/bbs/NBA/search/t/Live')
    time.sleep(1)
    htmltext = ""
    # The range value controls how many articles are fetched (15 => roughly 100 articles)
    for i in range(17):
        print(i)
        y = 500 * (i + 1)
        driver.execute_script(f"window.scrollTo(0, {y})")
        time.sleep(2)
        htmltext += driver.page_source
    driver.quit()
    file1 = open("ptt_origin_" + timespan + ".txt", "w", encoding="utf-8")
    file1.write(htmltext)
    file1.close()
    return htmltext
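# Because get_htmltext() concatenates one page_source snapshot per scroll, the
# combined HTML contains multiple <body> tags, so parse_htmltext() walks every
# snapshot via find_all('body'); duplicate articles are removed later by
# distinct_json_data().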
def parse_htmltext(htmltext):
    '''
    Parse the raw article-list HTML scraped from pttweb.cc.
    htmltext is the raw HTML, str.
    Returns a list of article-info dicts.
    '''
    result = []
    print('into parse_htmltext')
    soup = BeautifulSoup(htmltext, 'html.parser')
    #body = soup.find('body')
    body = soup.find_all('body')
    last_body = 0
    if len(body) > 0:
        last_body = len(body) - 1
    print('last_body:', str(last_body))
    for body_idx in range(last_body):
        posts = body[body_idx].select('div[class="mt-2"]')[0]
        articles = posts.find_all('div', 'e7-container')
        print('e7-container:', len(articles))
        # Set how many articles to fetch
        #range_count = 100
        #if len(articles) < range_count:
        #    range_count = len(articles)
        #print('range_count:', range_count)
        #index = 0
        try:
            for i in range(len(articles)):
                print(i)
                #if index > range_count:
                #    break
                article_info = {
                    "status": 0,
                    "fail_msg": "",
                    "index": 0,
                    "push_boo_count": "",
                    "comment_count": "",
                    "title": "",
                    "content_link": "",
                    "author": "",
                    "time": "",
                    "content": "",
                    "comments": []
                }
                # Push/boo (upvote/downvote) score
                push_boo_count = articles[i].find_all('div', 'e7-recommendScore')
                if len(push_boo_count) > 0:
                    article_info["push_boo_count"] = push_boo_count[0].text.replace('\n', '').strip()
                    print(push_boo_count[0].text.replace('\n', '').strip())
                else:
                    continue
                # Comment count
                comment_count = articles[i].find_all('div', 'e7-recommendCount')
                if len(comment_count) > 0:
                    article_info["comment_count"] = comment_count[0].text.replace('\n', '').strip()
                    print(comment_count[0].text.replace('\n', '').strip())
                else:
                    continue
                # Title
                title = articles[i].find_all('span', 'e7-show-if-device-is-not-xs')
                if len(title) > 0:
                    article_info["title"] = title[0].text
                    print(title[0].text)
                else:
                    continue
                # Article link
                content_link = articles[i].find_all('a')
                if len(content_link) > 0:
                    article_info["content_link"] = content_link[0].get('href')
                    print(content_link[0].get('href'))
                else:
                    continue
                # Author
                author = articles[i].find_all('span', 'grey--text e7-link-to-article')
                if len(author) > 0:
                    article_info["author"] = author[0].text
                    print(author[0].text)
                else:
                    continue
                # Post time (named post_time so the time module is not shadowed)
                post_time = articles[i].find_all('span', 'text-no-wrap')
                if len(post_time) > 1:
                    print(post_time[1].text)
                    article_info["time"] = post_time[1].text
                else:
                    continue
                #index
                #article_info["index"] = index + 1
                result.append(article_info)
                #index = index + 1
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            article_info["status"] = 1
            article_info["fail_msg"] = f"failed while writing: {exc_type} {fname} {exc_tb.tb_lineno}"
            result.append(article_info)
    return result
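# parse_url_content() opens each article page, scrolls roughly once per ten
# expected comments, clicks any "load more" buttons to expand collapsed
# comments, then parses the commenter account, push/boo tag, text, time, and
# floor of each comment.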
# Fetch the full content and comments for every article URL
def parse_url_content(articles):
    print('\r\n\r\n parse_urllist_start \r\n\r\n')
    print(len(articles))
    for article in articles:
        try:
            if article["content_link"].strip() != "":
                options = Options()
                options.headless = True
                driver = webdriver.Firefox(options=options)
                print('https://www.pttweb.cc' + article["content_link"])
                driver.get('https://www.pttweb.cc' + article["content_link"])
                time.sleep(0.2)
                scrl_count = 10
                print(article["comment_count"])
                if article["comment_count"].strip() != "" and article["comment_count"].strip().isnumeric():
                    scrl_count = int(article["comment_count"])
                scrl_count_s = int(scrl_count / 10)
                print(scrl_count)
                for i in range(scrl_count_s):
                    print(str(i) + "/" + str(scrl_count_s))
                    y = 100 * (i + 1)
                    driver.execute_script(f"window.scrollTo(0, {y})")
                    time.sleep(0.2)
                # Click every outlined "load more" button to expand collapsed comments.
                # find_elements_by_tag_name is the Selenium 3-era API this script targets;
                # Selenium 4.3+ uses driver.find_elements(By.TAG_NAME, 'button') instead.
                def clickformore():
                    hrefbtns = driver.find_elements_by_tag_name('button')
                    for btn in hrefbtns:
                        try:
                            s = btn.get_attribute('class')
                        except:
                            continue
                        if s == 'amber--text v-btn v-btn--outline v-btn--depressed theme--dark':
                            try:
                                btn.click()
                                time.sleep(0.2)
                            except:
                                continue
                # Called twice so buttons revealed by the first pass are also clicked.
                clickformore()
                clickformore()
                htmltext = driver.page_source
                driver.quit()
                #file1 = open("ptt_commant_" + timespan + ".txt", "w", encoding="utf-8")
                #file1.write(htmltext)
                #file1.close()
                soup = BeautifulSoup(htmltext, 'html.parser')
                body = soup.find('body')
                # Article content
                #content = body.select('div[class="e7-main-content"]')
                content = body.find_all('div', 'e7-main-content')
                account = body.find_all('div', 'e7-author')
                push_boo = body.find_all('div', 'e7-left')
                comment = body.find_all('div', 'yellow--text text--darken-2 e7-recommend-message')
                w_time = body.find_all('div', 'e7-larger-than-xs ml-2 e7-ipdatetime2')
                floor = body.find_all('span', 'e7-floor e7-xs')  # collected but unused; floor is parsed from w_time below
                if len(content) > 0:
                    print(content[0].text)
                    print('content:' + str(len(content)))
                    article["content"] = content[0].text
                    for i in range(scrl_count):
                        comment_body = {
                            "account": "",
                            "push_boo": "",
                            "comment": "",
                            "time": "",
                            "floor": ""
                        }
                        # Commenter account
                        print("account_count:" + str(len(account)))
                        print(str(i) + "/" + str(scrl_count))
                        if i < len(account):
                            comment_body["account"] = account[i].text
                            print('account:' + account[i].text)
                            # Push/boo (upvote/downvote) tag
                            comment_body["push_boo"] = push_boo[i].text
                            print('push_boo:' + push_boo[i].text)
                            # Comment text
                            comment_body["comment"] = comment[i].text
                            print('comment:' + comment[i].text)
                            # Time
                            comment_body["time"] = w_time[i].text.split(',')[0].strip()
                            print('time:' + w_time[i].text.split(',')[0].strip())
                            # Floor (comment position)
                            comment_body["floor"] = w_time[i].text.split(',')[1].strip()
                            print('floor:' + w_time[i].text.split(',')[1].strip())
                            print('')
                            article["comments"].append(comment_body)
                        else:
                            continue
                    # end of comment loop
                else:
                    continue
                print(article)
        except Exception as e:
            print(e)
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
    return articles
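# Deduplicate articles by content_link and assign a 1-based index, keeping at
# most limit_all articles.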
def distinct_json_data(data):
    if len(data) == 0:
        return []
    limit_all = 100  # cap on the number of articles kept
    limit_s = 1      # running count of articles kept
    unique_data = []
    for i in range(len(data)):
        if len(unique_data) > 0:
            flag_uni = False
            for j in range(len(unique_data)):
                if unique_data[j]["content_link"] == data[i]["content_link"]:
                    flag_uni = True
            if flag_uni == False:
                if limit_s < limit_all:
                    data[i]["index"] = limit_s
                    unique_data.append(data[i])
                    limit_s = limit_s + 1
                else:
                    break
        else:
            # The first article also gets an index and counts toward the cap.
            data[i]["index"] = limit_s
            unique_data.append(data[i])
            limit_s = limit_s + 1
        if limit_s >= limit_all:
            break
    return unique_data
def make_file(jsons, filename):
    file1 = open(filename + "_" + timespan + ".json", "w", encoding="utf-8")
    file1.write(json.dumps(jsons, indent=4, ensure_ascii=False))
    file1.close()
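# Hypothetical helper sketch, not part of the original script: flatten the
# article dicts produced by parse_url_content() into one pandas DataFrame with
# a row per comment (this also gives the pandas import above a use).
def comments_to_dataframe(articles):
    # One row per comment, carrying the parent article's title and author.
    rows = [
        {"title": a["title"], "author": a["author"], **c}
        for a in articles
        for c in a["comments"]
    ]
    return pd.DataFrame(rows)
# Example: df = comments_to_dataframe(all_articles_comments); print(df.head())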
if __name__ == '__main__':
    htmltext = get_htmltext()
    get_all_article = parse_htmltext(htmltext)
    make_file(get_all_article, 'get_all_article')
    # Remove duplicate JSON records and assign each an index
    distinct_data = distinct_json_data(get_all_article)
    make_file(distinct_data, 'distinct_article')
    all_articles_comments = parse_url_content(distinct_data)
    make_file(all_articles_comments, 'all_articles_comments')